From e000acc145928693833f09152244242a678d3cd5 Mon Sep 17 00:00:00 2001
From: Kristen Carlson Accardi <kristen@linux.intel.com>
Date: Wed, 15 Apr 2020 14:04:43 -0700
Subject: [PATCH 001/502] objtool: Do not assume order of parent/child
 functions

If a .cold function is examined prior to its parent, the link
to the parent/child function can be overwritten when the parent
is examined. Only update pfunc and cfunc if they were previously
nil to prevent this from happening.

This fixes an issue seen when compiling with -ffunction-sections.

Signed-off-by: Kristen Carlson Accardi <kristen@linux.intel.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 tools/objtool/elf.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 84225679f96d..f953d3a15612 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -434,7 +434,13 @@ static int read_symbols(struct elf *elf)
 			size_t pnamelen;
 			if (sym->type != STT_FUNC)
 				continue;
-			sym->pfunc = sym->cfunc = sym;
+
+			if (sym->pfunc == NULL)
+				sym->pfunc = sym;
+
+			if (sym->cfunc == NULL)
+				sym->cfunc = sym;
+
 			coldstr = strstr(sym->name, ".cold");
 			if (!coldstr)
 				continue;

From 1e968bf5caf65eff3f080102879aaa5440c261b6 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Tue, 21 Apr 2020 11:25:01 -0700
Subject: [PATCH 002/502] objtool: Use sh_info to find the base for .rela
 sections

ELF doesn't require .rela section names to match the base section. Use
the section index in sh_info to find the section instead of looking it
up by name.

LLD, for example, generates a .rela section that doesn't match the base
section name when we merge sections in a linker script for a binary
compiled with -ffunction-sections.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
---
 tools/objtool/elf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index f953d3a15612..5bc259c9d892 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -508,7 +508,7 @@ static int read_relas(struct elf *elf)
 		if (sec->sh.sh_type != SHT_RELA)
 			continue;
 
-		sec->base = find_section_by_name(elf, sec->name + 5);
+		sec->base = find_section_by_index(elf, sec->sh.sh_info);
 		if (!sec->base) {
 			WARN("can't find base section for rela section %s",
 			     sec->name);

From f1974222634010486c1692e843af0ab11304dd2c Mon Sep 17 00:00:00 2001
From: Matt Helsley <mhelsley@vmware.com>
Date: Fri, 29 May 2020 14:01:13 -0700
Subject: [PATCH 003/502] objtool: Rename rela to reloc

Before supporting additional relocation types rename the relevant
types and functions from "rela" to "reloc". This work can be done with
the following regex:

  sed -e 's/struct rela/struct reloc/g' \
      -e 's/\([_\*]\)rela\(s\{0,1\}\)/\1reloc\2/g' \
      -e 's/tmprela\(s\{0,1\}\)/tmpreloc\1/g' \
      -e 's/relasec/relocsec/g' \
      -e 's/rela_list/reloc_list/g' \
      -e 's/rela_hash/reloc_hash/g' \
      -e 's/add_rela/add_reloc/g' \
      -e 's/rela->/reloc->/g' \
      -e '/rela[,\.]/{ s/\([^\.>]\)rela\([\.,]\)/\1reloc\2/g ; }' \
      -e 's/rela =/reloc =/g' \
      -e 's/relas =/relocs =/g' \
      -e 's/relas\[/relocs[/g' \
      -e 's/relaname =/relocname =/g' \
      -e 's/= rela\;/= reloc\;/g' \
      -e 's/= relas\;/= relocs\;/g' \
      -e 's/= relaname\;/= relocname\;/g' \
      -e 's/, rela)/, reloc)/g' \
      -e 's/\([ @]\)rela\([ "]\)/\1reloc\2/g' \
      -e 's/ rela$/ reloc/g' \
      -e 's/, relaname/, relocname/g' \
      -e 's/sec->rela/sec->reloc/g' \
      -e 's/(\(!\{0,1\}\)rela/(\1reloc/g' \
      -i \
      arch.h \
      arch/x86/decode.c  \
      check.c \
      check.h \
      elf.c \
      elf.h \
      orc_gen.c \
      special.c

Notable exceptions which complicate the regex include gelf_*
library calls and standard/expected section names which still use
"rela" because they encode the type of relocation expected. Also, keep
"rela" in the struct because it encodes a specific type of relocation
we currently expect.

It will eventually turn into a member of an anonymous union when a
subsequent patch adds implicit addend, or "rel", relocation support.

Signed-off-by: Matt Helsley <mhelsley@vmware.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 tools/objtool/arch.h            |   2 +-
 tools/objtool/arch/x86/decode.c |   2 +-
 tools/objtool/check.c           | 196 ++++++++++++++++----------------
 tools/objtool/check.h           |   2 +-
 tools/objtool/elf.c             | 138 +++++++++++-----------
 tools/objtool/elf.h             |  22 ++--
 tools/objtool/orc_gen.c         |  46 ++++----
 tools/objtool/special.c         |  28 ++---
 8 files changed, 218 insertions(+), 218 deletions(-)

diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h
index eda15a5a285e..d0969a9328c2 100644
--- a/tools/objtool/arch.h
+++ b/tools/objtool/arch.h
@@ -82,6 +82,6 @@ bool arch_callee_saved_reg(unsigned char reg);
 
 unsigned long arch_jump_destination(struct instruction *insn);
 
-unsigned long arch_dest_rela_offset(int addend);
+unsigned long arch_dest_reloc_offset(int addend);
 
 #endif /* _ARCH_H */
diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c
index 4b504fc90bbb..fe83d4c92825 100644
--- a/tools/objtool/arch/x86/decode.c
+++ b/tools/objtool/arch/x86/decode.c
@@ -67,7 +67,7 @@ bool arch_callee_saved_reg(unsigned char reg)
 	}
 }
 
-unsigned long arch_dest_rela_offset(int addend)
+unsigned long arch_dest_reloc_offset(int addend)
 {
 	return addend + 4;
 }
diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 63d65a702900..28ce311ea90c 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -352,7 +352,7 @@ static struct instruction *find_last_insn(struct objtool_file *file,
 static int add_dead_ends(struct objtool_file *file)
 {
 	struct section *sec;
-	struct rela *rela;
+	struct reloc *reloc;
 	struct instruction *insn;
 
 	/*
@@ -370,24 +370,24 @@ static int add_dead_ends(struct objtool_file *file)
 	if (!sec)
 		goto reachable;
 
-	list_for_each_entry(rela, &sec->rela_list, list) {
-		if (rela->sym->type != STT_SECTION) {
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
+		if (reloc->sym->type != STT_SECTION) {
 			WARN("unexpected relocation symbol type in %s", sec->name);
 			return -1;
 		}
-		insn = find_insn(file, rela->sym->sec, rela->addend);
+		insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (insn)
 			insn = list_prev_entry(insn, list);
-		else if (rela->addend == rela->sym->sec->len) {
-			insn = find_last_insn(file, rela->sym->sec);
+		else if (reloc->addend == reloc->sym->sec->len) {
+			insn = find_last_insn(file, reloc->sym->sec);
 			if (!insn) {
 				WARN("can't find unreachable insn at %s+0x%x",
-				     rela->sym->sec->name, rela->addend);
+				     reloc->sym->sec->name, reloc->addend);
 				return -1;
 			}
 		} else {
 			WARN("can't find unreachable insn at %s+0x%x",
-			     rela->sym->sec->name, rela->addend);
+			     reloc->sym->sec->name, reloc->addend);
 			return -1;
 		}
 
@@ -405,24 +405,24 @@ reachable:
 	if (!sec)
 		return 0;
 
-	list_for_each_entry(rela, &sec->rela_list, list) {
-		if (rela->sym->type != STT_SECTION) {
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
+		if (reloc->sym->type != STT_SECTION) {
 			WARN("unexpected relocation symbol type in %s", sec->name);
 			return -1;
 		}
-		insn = find_insn(file, rela->sym->sec, rela->addend);
+		insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (insn)
 			insn = list_prev_entry(insn, list);
-		else if (rela->addend == rela->sym->sec->len) {
-			insn = find_last_insn(file, rela->sym->sec);
+		else if (reloc->addend == reloc->sym->sec->len) {
+			insn = find_last_insn(file, reloc->sym->sec);
 			if (!insn) {
 				WARN("can't find reachable insn at %s+0x%x",
-				     rela->sym->sec->name, rela->addend);
+				     reloc->sym->sec->name, reloc->addend);
 				return -1;
 			}
 		} else {
 			WARN("can't find reachable insn at %s+0x%x",
-			     rela->sym->sec->name, rela->addend);
+			     reloc->sym->sec->name, reloc->addend);
 			return -1;
 		}
 
@@ -440,26 +440,26 @@ static void add_ignores(struct objtool_file *file)
 	struct instruction *insn;
 	struct section *sec;
 	struct symbol *func;
-	struct rela *rela;
+	struct reloc *reloc;
 
 	sec = find_section_by_name(file->elf, ".rela.discard.func_stack_frame_non_standard");
 	if (!sec)
 		return;
 
-	list_for_each_entry(rela, &sec->rela_list, list) {
-		switch (rela->sym->type) {
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
+		switch (reloc->sym->type) {
 		case STT_FUNC:
-			func = rela->sym;
+			func = reloc->sym;
 			break;
 
 		case STT_SECTION:
-			func = find_func_by_offset(rela->sym->sec, rela->addend);
+			func = find_func_by_offset(reloc->sym->sec, reloc->addend);
 			if (!func)
 				continue;
 			break;
 
 		default:
-			WARN("unexpected relocation symbol type in %s: %d", sec->name, rela->sym->type);
+			WARN("unexpected relocation symbol type in %s: %d", sec->name, reloc->sym->type);
 			continue;
 		}
 
@@ -557,20 +557,20 @@ static void add_uaccess_safe(struct objtool_file *file)
 static int add_ignore_alternatives(struct objtool_file *file)
 {
 	struct section *sec;
-	struct rela *rela;
+	struct reloc *reloc;
 	struct instruction *insn;
 
 	sec = find_section_by_name(file->elf, ".rela.discard.ignore_alts");
 	if (!sec)
 		return 0;
 
-	list_for_each_entry(rela, &sec->rela_list, list) {
-		if (rela->sym->type != STT_SECTION) {
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
+		if (reloc->sym->type != STT_SECTION) {
 			WARN("unexpected relocation symbol type in %s", sec->name);
 			return -1;
 		}
 
-		insn = find_insn(file, rela->sym->sec, rela->addend);
+		insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (!insn) {
 			WARN("bad .discard.ignore_alts entry");
 			return -1;
@@ -588,7 +588,7 @@ static int add_ignore_alternatives(struct objtool_file *file)
 static int add_jump_destinations(struct objtool_file *file)
 {
 	struct instruction *insn;
-	struct rela *rela;
+	struct reloc *reloc;
 	struct section *dest_sec;
 	unsigned long dest_off;
 
@@ -599,19 +599,19 @@ static int add_jump_destinations(struct objtool_file *file)
 		if (insn->ignore || insn->offset == FAKE_JUMP_OFFSET)
 			continue;
 
-		rela = find_rela_by_dest_range(file->elf, insn->sec,
+		reloc = find_reloc_by_dest_range(file->elf, insn->sec,
 					       insn->offset, insn->len);
-		if (!rela) {
+		if (!reloc) {
 			dest_sec = insn->sec;
 			dest_off = arch_jump_destination(insn);
-		} else if (rela->sym->type == STT_SECTION) {
-			dest_sec = rela->sym->sec;
-			dest_off = arch_dest_rela_offset(rela->addend);
-		} else if (rela->sym->sec->idx) {
-			dest_sec = rela->sym->sec;
-			dest_off = rela->sym->sym.st_value +
-				   arch_dest_rela_offset(rela->addend);
-		} else if (strstr(rela->sym->name, "_indirect_thunk_")) {
+		} else if (reloc->sym->type == STT_SECTION) {
+			dest_sec = reloc->sym->sec;
+			dest_off = arch_dest_reloc_offset(reloc->addend);
+		} else if (reloc->sym->sec->idx) {
+			dest_sec = reloc->sym->sec;
+			dest_off = reloc->sym->sym.st_value +
+				   arch_dest_reloc_offset(reloc->addend);
+		} else if (strstr(reloc->sym->name, "_indirect_thunk_")) {
 			/*
 			 * Retpoline jumps are really dynamic jumps in
 			 * disguise, so convert them accordingly.
@@ -625,7 +625,7 @@ static int add_jump_destinations(struct objtool_file *file)
 			continue;
 		} else {
 			/* external sibling call */
-			insn->call_dest = rela->sym;
+			insn->call_dest = reloc->sym;
 			continue;
 		}
 
@@ -701,15 +701,15 @@ static int add_call_destinations(struct objtool_file *file)
 {
 	struct instruction *insn;
 	unsigned long dest_off;
-	struct rela *rela;
+	struct reloc *reloc;
 
 	for_each_insn(file, insn) {
 		if (insn->type != INSN_CALL)
 			continue;
 
-		rela = find_rela_by_dest_range(file->elf, insn->sec,
+		reloc = find_reloc_by_dest_range(file->elf, insn->sec,
 					       insn->offset, insn->len);
-		if (!rela) {
+		if (!reloc) {
 			dest_off = arch_jump_destination(insn);
 			insn->call_dest = find_func_by_offset(insn->sec, dest_off);
 			if (!insn->call_dest)
@@ -729,19 +729,19 @@ static int add_call_destinations(struct objtool_file *file)
 				return -1;
 			}
 
-		} else if (rela->sym->type == STT_SECTION) {
-			dest_off = arch_dest_rela_offset(rela->addend);
-			insn->call_dest = find_func_by_offset(rela->sym->sec,
+		} else if (reloc->sym->type == STT_SECTION) {
+			dest_off = arch_dest_reloc_offset(reloc->addend);
+			insn->call_dest = find_func_by_offset(reloc->sym->sec,
 							      dest_off);
 			if (!insn->call_dest) {
 				WARN_FUNC("can't find call dest symbol at %s+0x%lx",
 					  insn->sec, insn->offset,
-					  rela->sym->sec->name,
+					  reloc->sym->sec->name,
 					  dest_off);
 				return -1;
 			}
 		} else
-			insn->call_dest = rela->sym;
+			insn->call_dest = reloc->sym;
 
 		/*
 		 * Whatever stack impact regular CALLs have, should be undone
@@ -849,7 +849,7 @@ static int handle_group_alt(struct objtool_file *file,
 		 */
 		if ((insn->offset != special_alt->new_off ||
 		    (insn->type != INSN_CALL && !is_static_jump(insn))) &&
-		    find_rela_by_dest_range(file->elf, insn->sec, insn->offset, insn->len)) {
+		    find_reloc_by_dest_range(file->elf, insn->sec, insn->offset, insn->len)) {
 
 			WARN_FUNC("unsupported relocation in alternatives section",
 				  insn->sec, insn->offset);
@@ -995,34 +995,34 @@ out:
 }
 
 static int add_jump_table(struct objtool_file *file, struct instruction *insn,
-			    struct rela *table)
+			    struct reloc *table)
 {
-	struct rela *rela = table;
+	struct reloc *reloc = table;
 	struct instruction *dest_insn;
 	struct alternative *alt;
 	struct symbol *pfunc = insn->func->pfunc;
 	unsigned int prev_offset = 0;
 
 	/*
-	 * Each @rela is a switch table relocation which points to the target
+	 * Each @reloc is a switch table relocation which points to the target
 	 * instruction.
 	 */
-	list_for_each_entry_from(rela, &table->sec->rela_list, list) {
+	list_for_each_entry_from(reloc, &table->sec->reloc_list, list) {
 
 		/* Check for the end of the table: */
-		if (rela != table && rela->jump_table_start)
+		if (reloc != table && reloc->jump_table_start)
 			break;
 
 		/* Make sure the table entries are consecutive: */
-		if (prev_offset && rela->offset != prev_offset + 8)
+		if (prev_offset && reloc->offset != prev_offset + 8)
 			break;
 
 		/* Detect function pointers from contiguous objects: */
-		if (rela->sym->sec == pfunc->sec &&
-		    rela->addend == pfunc->offset)
+		if (reloc->sym->sec == pfunc->sec &&
+		    reloc->addend == pfunc->offset)
 			break;
 
-		dest_insn = find_insn(file, rela->sym->sec, rela->addend);
+		dest_insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (!dest_insn)
 			break;
 
@@ -1038,7 +1038,7 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn,
 
 		alt->insn = dest_insn;
 		list_add_tail(&alt->list, &insn->alts);
-		prev_offset = rela->offset;
+		prev_offset = reloc->offset;
 	}
 
 	if (!prev_offset) {
@@ -1093,11 +1093,11 @@ static int add_jump_table(struct objtool_file *file, struct instruction *insn,
  *
  *    NOTE: RETPOLINE made it harder still to decode dynamic jumps.
  */
-static struct rela *find_jump_table(struct objtool_file *file,
+static struct reloc *find_jump_table(struct objtool_file *file,
 				      struct symbol *func,
 				      struct instruction *insn)
 {
-	struct rela *text_rela, *table_rela;
+	struct reloc *text_reloc, *table_reloc;
 	struct instruction *dest_insn, *orig_insn = insn;
 	struct section *table_sec;
 	unsigned long table_offset;
@@ -1122,16 +1122,16 @@ static struct rela *find_jump_table(struct objtool_file *file,
 		    break;
 
 		/* look for a relocation which references .rodata */
-		text_rela = find_rela_by_dest_range(file->elf, insn->sec,
+		text_reloc = find_reloc_by_dest_range(file->elf, insn->sec,
 						    insn->offset, insn->len);
-		if (!text_rela || text_rela->sym->type != STT_SECTION ||
-		    !text_rela->sym->sec->rodata)
+		if (!text_reloc || text_reloc->sym->type != STT_SECTION ||
+		    !text_reloc->sym->sec->rodata)
 			continue;
 
-		table_offset = text_rela->addend;
-		table_sec = text_rela->sym->sec;
+		table_offset = text_reloc->addend;
+		table_sec = text_reloc->sym->sec;
 
-		if (text_rela->type == R_X86_64_PC32)
+		if (text_reloc->type == R_X86_64_PC32)
 			table_offset += 4;
 
 		/*
@@ -1148,14 +1148,14 @@ static struct rela *find_jump_table(struct objtool_file *file,
 			continue;
 
 		/*
-		 * Each table entry has a rela associated with it.  The rela
+		 * Each table entry has a reloc associated with it.  The reloc
 		 * should reference text in the same function as the original
 		 * instruction.
 		 */
-		table_rela = find_rela_by_dest(file->elf, table_sec, table_offset);
-		if (!table_rela)
+		table_reloc = find_reloc_by_dest(file->elf, table_sec, table_offset);
+		if (!table_reloc)
 			continue;
-		dest_insn = find_insn(file, table_rela->sym->sec, table_rela->addend);
+		dest_insn = find_insn(file, table_reloc->sym->sec, table_reloc->addend);
 		if (!dest_insn || !dest_insn->func || dest_insn->func->pfunc != func)
 			continue;
 
@@ -1164,10 +1164,10 @@ static struct rela *find_jump_table(struct objtool_file *file,
 		 * indicates a rare GCC quirk/bug which can leave dead code
 		 * behind.
 		 */
-		if (text_rela->type == R_X86_64_PC32)
+		if (text_reloc->type == R_X86_64_PC32)
 			file->ignore_unreachables = true;
 
-		return table_rela;
+		return table_reloc;
 	}
 
 	return NULL;
@@ -1181,7 +1181,7 @@ static void mark_func_jump_tables(struct objtool_file *file,
 				    struct symbol *func)
 {
 	struct instruction *insn, *last = NULL;
-	struct rela *rela;
+	struct reloc *reloc;
 
 	func_for_each_insn(file, func, insn) {
 		if (!last)
@@ -1204,10 +1204,10 @@ static void mark_func_jump_tables(struct objtool_file *file,
 		if (insn->type != INSN_JUMP_DYNAMIC)
 			continue;
 
-		rela = find_jump_table(file, func, insn);
-		if (rela) {
-			rela->jump_table_start = true;
-			insn->jump_table = rela;
+		reloc = find_jump_table(file, func, insn);
+		if (reloc) {
+			reloc->jump_table_start = true;
+			insn->jump_table = reloc;
 		}
 	}
 }
@@ -1261,8 +1261,8 @@ static int add_jump_table_alts(struct objtool_file *file)
 
 static int read_unwind_hints(struct objtool_file *file)
 {
-	struct section *sec, *relasec;
-	struct rela *rela;
+	struct section *sec, *relocsec;
+	struct reloc *reloc;
 	struct unwind_hint *hint;
 	struct instruction *insn;
 	struct cfi_reg *cfa;
@@ -1272,8 +1272,8 @@ static int read_unwind_hints(struct objtool_file *file)
 	if (!sec)
 		return 0;
 
-	relasec = sec->rela;
-	if (!relasec) {
+	relocsec = sec->reloc;
+	if (!relocsec) {
 		WARN("missing .rela.discard.unwind_hints section");
 		return -1;
 	}
@@ -1288,13 +1288,13 @@ static int read_unwind_hints(struct objtool_file *file)
 	for (i = 0; i < sec->len / sizeof(struct unwind_hint); i++) {
 		hint = (struct unwind_hint *)sec->data->d_buf + i;
 
-		rela = find_rela_by_dest(file->elf, sec, i * sizeof(*hint));
-		if (!rela) {
-			WARN("can't find rela for unwind_hints[%d]", i);
+		reloc = find_reloc_by_dest(file->elf, sec, i * sizeof(*hint));
+		if (!reloc) {
+			WARN("can't find reloc for unwind_hints[%d]", i);
 			return -1;
 		}
 
-		insn = find_insn(file, rela->sym->sec, rela->addend);
+		insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (!insn) {
 			WARN("can't find insn for unwind_hints[%d]", i);
 			return -1;
@@ -1352,19 +1352,19 @@ static int read_retpoline_hints(struct objtool_file *file)
 {
 	struct section *sec;
 	struct instruction *insn;
-	struct rela *rela;
+	struct reloc *reloc;
 
 	sec = find_section_by_name(file->elf, ".rela.discard.retpoline_safe");
 	if (!sec)
 		return 0;
 
-	list_for_each_entry(rela, &sec->rela_list, list) {
-		if (rela->sym->type != STT_SECTION) {
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
+		if (reloc->sym->type != STT_SECTION) {
 			WARN("unexpected relocation symbol type in %s", sec->name);
 			return -1;
 		}
 
-		insn = find_insn(file, rela->sym->sec, rela->addend);
+		insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (!insn) {
 			WARN("bad .discard.retpoline_safe entry");
 			return -1;
@@ -1387,19 +1387,19 @@ static int read_instr_hints(struct objtool_file *file)
 {
 	struct section *sec;
 	struct instruction *insn;
-	struct rela *rela;
+	struct reloc *reloc;
 
 	sec = find_section_by_name(file->elf, ".rela.discard.instr_end");
 	if (!sec)
 		return 0;
 
-	list_for_each_entry(rela, &sec->rela_list, list) {
-		if (rela->sym->type != STT_SECTION) {
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
+		if (reloc->sym->type != STT_SECTION) {
 			WARN("unexpected relocation symbol type in %s", sec->name);
 			return -1;
 		}
 
-		insn = find_insn(file, rela->sym->sec, rela->addend);
+		insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (!insn) {
 			WARN("bad .discard.instr_end entry");
 			return -1;
@@ -1412,13 +1412,13 @@ static int read_instr_hints(struct objtool_file *file)
 	if (!sec)
 		return 0;
 
-	list_for_each_entry(rela, &sec->rela_list, list) {
-		if (rela->sym->type != STT_SECTION) {
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
+		if (reloc->sym->type != STT_SECTION) {
 			WARN("unexpected relocation symbol type in %s", sec->name);
 			return -1;
 		}
 
-		insn = find_insn(file, rela->sym->sec, rela->addend);
+		insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (!insn) {
 			WARN("bad .discard.instr_begin entry");
 			return -1;
@@ -1434,22 +1434,22 @@ static int read_intra_function_calls(struct objtool_file *file)
 {
 	struct instruction *insn;
 	struct section *sec;
-	struct rela *rela;
+	struct reloc *reloc;
 
 	sec = find_section_by_name(file->elf, ".rela.discard.intra_function_calls");
 	if (!sec)
 		return 0;
 
-	list_for_each_entry(rela, &sec->rela_list, list) {
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
 		unsigned long dest_off;
 
-		if (rela->sym->type != STT_SECTION) {
+		if (reloc->sym->type != STT_SECTION) {
 			WARN("unexpected relocation symbol type in %s",
 			     sec->name);
 			return -1;
 		}
 
-		insn = find_insn(file, rela->sym->sec, rela->addend);
+		insn = find_insn(file, reloc->sym->sec, reloc->addend);
 		if (!insn) {
 			WARN("bad .discard.intra_function_call entry");
 			return -1;
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index 906b5210f7ca..061aa96e15d3 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -37,7 +37,7 @@ struct instruction {
 	struct symbol *call_dest;
 	struct instruction *jump_dest;
 	struct instruction *first_jump_src;
-	struct rela *jump_table;
+	struct reloc *jump_table;
 	struct list_head alts;
 	struct symbol *func;
 	struct list_head stack_ops;
diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 5bc259c9d892..3160931e858c 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -228,26 +228,26 @@ struct symbol *find_symbol_by_name(const struct elf *elf, const char *name)
 	return NULL;
 }
 
-struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec,
+struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec,
 				     unsigned long offset, unsigned int len)
 {
-	struct rela *rela, *r = NULL;
+	struct reloc *reloc, *r = NULL;
 	unsigned long o;
 
-	if (!sec->rela)
+	if (!sec->reloc)
 		return NULL;
 
-	sec = sec->rela;
+	sec = sec->reloc;
 
 	for_offset_range(o, offset, offset + len) {
-		elf_hash_for_each_possible(elf->rela_hash, rela, hash,
+		elf_hash_for_each_possible(elf->reloc_hash, reloc, hash,
 				       sec_offset_hash(sec, o)) {
-			if (rela->sec != sec)
+			if (reloc->sec != sec)
 				continue;
 
-			if (rela->offset >= offset && rela->offset < offset + len) {
-				if (!r || rela->offset < r->offset)
-					r = rela;
+			if (reloc->offset >= offset && reloc->offset < offset + len) {
+				if (!r || reloc->offset < r->offset)
+					r = reloc;
 			}
 		}
 		if (r)
@@ -257,9 +257,9 @@ struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec,
 	return NULL;
 }
 
-struct rela *find_rela_by_dest(const struct elf *elf, struct section *sec, unsigned long offset)
+struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset)
 {
-	return find_rela_by_dest_range(elf, sec, offset, 1);
+	return find_reloc_by_dest_range(elf, sec, offset, 1);
 }
 
 static int read_sections(struct elf *elf)
@@ -288,7 +288,7 @@ static int read_sections(struct elf *elf)
 		memset(sec, 0, sizeof(*sec));
 
 		INIT_LIST_HEAD(&sec->symbol_list);
-		INIT_LIST_HEAD(&sec->rela_list);
+		INIT_LIST_HEAD(&sec->reloc_list);
 
 		s = elf_getscn(elf->elf, i);
 		if (!s) {
@@ -488,21 +488,21 @@ err:
 	return -1;
 }
 
-void elf_add_rela(struct elf *elf, struct rela *rela)
+void elf_add_reloc(struct elf *elf, struct reloc *reloc)
 {
-	struct section *sec = rela->sec;
+	struct section *sec = reloc->sec;
 
-	list_add_tail(&rela->list, &sec->rela_list);
-	elf_hash_add(elf->rela_hash, &rela->hash, rela_hash(rela));
+	list_add_tail(&reloc->list, &sec->reloc_list);
+	elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc));
 }
 
-static int read_relas(struct elf *elf)
+static int read_relocs(struct elf *elf)
 {
 	struct section *sec;
-	struct rela *rela;
+	struct reloc *reloc;
 	int i;
 	unsigned int symndx;
-	unsigned long nr_rela, max_rela = 0, tot_rela = 0;
+	unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0;
 
 	list_for_each_entry(sec, &elf->sections, list) {
 		if (sec->sh.sh_type != SHT_RELA)
@@ -510,49 +510,49 @@ static int read_relas(struct elf *elf)
 
 		sec->base = find_section_by_index(elf, sec->sh.sh_info);
 		if (!sec->base) {
-			WARN("can't find base section for rela section %s",
+			WARN("can't find base section for reloc section %s",
 			     sec->name);
 			return -1;
 		}
 
-		sec->base->rela = sec;
+		sec->base->reloc = sec;
 
-		nr_rela = 0;
+		nr_reloc = 0;
 		for (i = 0; i < sec->sh.sh_size / sec->sh.sh_entsize; i++) {
-			rela = malloc(sizeof(*rela));
-			if (!rela) {
+			reloc = malloc(sizeof(*reloc));
+			if (!reloc) {
 				perror("malloc");
 				return -1;
 			}
-			memset(rela, 0, sizeof(*rela));
+			memset(reloc, 0, sizeof(*reloc));
 
-			if (!gelf_getrela(sec->data, i, &rela->rela)) {
+			if (!gelf_getrela(sec->data, i, &reloc->rela)) {
 				WARN_ELF("gelf_getrela");
 				return -1;
 			}
 
-			rela->type = GELF_R_TYPE(rela->rela.r_info);
-			rela->addend = rela->rela.r_addend;
-			rela->offset = rela->rela.r_offset;
-			symndx = GELF_R_SYM(rela->rela.r_info);
-			rela->sym = find_symbol_by_index(elf, symndx);
-			rela->sec = sec;
-			if (!rela->sym) {
-				WARN("can't find rela entry symbol %d for %s",
+			reloc->type = GELF_R_TYPE(reloc->rela.r_info);
+			reloc->addend = reloc->rela.r_addend;
+			reloc->offset = reloc->rela.r_offset;
+			symndx = GELF_R_SYM(reloc->rela.r_info);
+			reloc->sym = find_symbol_by_index(elf, symndx);
+			reloc->sec = sec;
+			if (!reloc->sym) {
+				WARN("can't find reloc entry symbol %d for %s",
 				     symndx, sec->name);
 				return -1;
 			}
 
-			elf_add_rela(elf, rela);
-			nr_rela++;
+			elf_add_reloc(elf, reloc);
+			nr_reloc++;
 		}
-		max_rela = max(max_rela, nr_rela);
-		tot_rela += nr_rela;
+		max_reloc = max(max_reloc, nr_reloc);
+		tot_reloc += nr_reloc;
 	}
 
 	if (stats) {
-		printf("max_rela: %lu\n", max_rela);
-		printf("tot_rela: %lu\n", tot_rela);
+		printf("max_reloc: %lu\n", max_reloc);
+		printf("tot_reloc: %lu\n", tot_reloc);
 	}
 
 	return 0;
@@ -578,7 +578,7 @@ struct elf *elf_open_read(const char *name, int flags)
 	elf_hash_init(elf->symbol_name_hash);
 	elf_hash_init(elf->section_hash);
 	elf_hash_init(elf->section_name_hash);
-	elf_hash_init(elf->rela_hash);
+	elf_hash_init(elf->reloc_hash);
 
 	elf->fd = open(name, flags);
 	if (elf->fd == -1) {
@@ -611,7 +611,7 @@ struct elf *elf_open_read(const char *name, int flags)
 	if (read_symbols(elf))
 		goto err;
 
-	if (read_relas(elf))
+	if (read_relocs(elf))
 		goto err;
 
 	return elf;
@@ -637,7 +637,7 @@ struct section *elf_create_section(struct elf *elf, const char *name,
 	memset(sec, 0, sizeof(*sec));
 
 	INIT_LIST_HEAD(&sec->symbol_list);
-	INIT_LIST_HEAD(&sec->rela_list);
+	INIT_LIST_HEAD(&sec->reloc_list);
 
 	s = elf_newscn(elf->elf);
 	if (!s) {
@@ -722,25 +722,25 @@ struct section *elf_create_section(struct elf *elf, const char *name,
 	return sec;
 }
 
-struct section *elf_create_rela_section(struct elf *elf, struct section *base)
+struct section *elf_create_reloc_section(struct elf *elf, struct section *base)
 {
-	char *relaname;
+	char *relocname;
 	struct section *sec;
 
-	relaname = malloc(strlen(base->name) + strlen(".rela") + 1);
-	if (!relaname) {
+	relocname = malloc(strlen(base->name) + strlen(".rela") + 1);
+	if (!relocname) {
 		perror("malloc");
 		return NULL;
 	}
-	strcpy(relaname, ".rela");
-	strcat(relaname, base->name);
+	strcpy(relocname, ".rela");
+	strcat(relocname, base->name);
 
-	sec = elf_create_section(elf, relaname, sizeof(GElf_Rela), 0);
-	free(relaname);
+	sec = elf_create_section(elf, relocname, sizeof(GElf_Rela), 0);
+	free(relocname);
 	if (!sec)
 		return NULL;
 
-	base->rela = sec;
+	base->reloc = sec;
 	sec->base = base;
 
 	sec->sh.sh_type = SHT_RELA;
@@ -752,33 +752,33 @@ struct section *elf_create_rela_section(struct elf *elf, struct section *base)
 	return sec;
 }
 
-int elf_rebuild_rela_section(struct section *sec)
+int elf_rebuild_reloc_section(struct section *sec)
 {
-	struct rela *rela;
+	struct reloc *reloc;
 	int nr, idx = 0, size;
-	GElf_Rela *relas;
+	GElf_Rela *relocs;
 
 	nr = 0;
-	list_for_each_entry(rela, &sec->rela_list, list)
+	list_for_each_entry(reloc, &sec->reloc_list, list)
 		nr++;
 
-	size = nr * sizeof(*relas);
-	relas = malloc(size);
-	if (!relas) {
+	size = nr * sizeof(*relocs);
+	relocs = malloc(size);
+	if (!relocs) {
 		perror("malloc");
 		return -1;
 	}
 
-	sec->data->d_buf = relas;
+	sec->data->d_buf = relocs;
 	sec->data->d_size = size;
 
 	sec->sh.sh_size = size;
 
 	idx = 0;
-	list_for_each_entry(rela, &sec->rela_list, list) {
-		relas[idx].r_offset = rela->offset;
-		relas[idx].r_addend = rela->addend;
-		relas[idx].r_info = GELF_R_INFO(rela->sym->idx, rela->type);
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
+		relocs[idx].r_offset = reloc->offset;
+		relocs[idx].r_addend = reloc->addend;
+		relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type);
 		idx++;
 	}
 
@@ -821,7 +821,7 @@ void elf_close(struct elf *elf)
 {
 	struct section *sec, *tmpsec;
 	struct symbol *sym, *tmpsym;
-	struct rela *rela, *tmprela;
+	struct reloc *reloc, *tmpreloc;
 
 	if (elf->elf)
 		elf_end(elf->elf);
@@ -835,10 +835,10 @@ void elf_close(struct elf *elf)
 			hash_del(&sym->hash);
 			free(sym);
 		}
-		list_for_each_entry_safe(rela, tmprela, &sec->rela_list, list) {
-			list_del(&rela->list);
-			hash_del(&rela->hash);
-			free(rela);
+		list_for_each_entry_safe(reloc, tmpreloc, &sec->reloc_list, list) {
+			list_del(&reloc->list);
+			hash_del(&reloc->hash);
+			free(reloc);
 		}
 		list_del(&sec->list);
 		free(sec);
diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
index f4fe1d6ea392..6ad759fd778e 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -32,8 +32,8 @@ struct section {
 	GElf_Shdr sh;
 	struct rb_root symbol_tree;
 	struct list_head symbol_list;
-	struct list_head rela_list;
-	struct section *base, *rela;
+	struct list_head reloc_list;
+	struct section *base, *reloc;
 	struct symbol *sym;
 	Elf_Data *data;
 	char *name;
@@ -58,7 +58,7 @@ struct symbol {
 	bool uaccess_safe;
 };
 
-struct rela {
+struct reloc {
 	struct list_head list;
 	struct hlist_node hash;
 	GElf_Rela rela;
@@ -82,7 +82,7 @@ struct elf {
 	DECLARE_HASHTABLE(symbol_name_hash, ELF_HASH_BITS);
 	DECLARE_HASHTABLE(section_hash, ELF_HASH_BITS);
 	DECLARE_HASHTABLE(section_name_hash, ELF_HASH_BITS);
-	DECLARE_HASHTABLE(rela_hash, ELF_HASH_BITS);
+	DECLARE_HASHTABLE(reloc_hash, ELF_HASH_BITS);
 };
 
 #define OFFSET_STRIDE_BITS	4
@@ -109,15 +109,15 @@ static inline u32 sec_offset_hash(struct section *sec, unsigned long offset)
 	return ol;
 }
 
-static inline u32 rela_hash(struct rela *rela)
+static inline u32 reloc_hash(struct reloc *reloc)
 {
-	return sec_offset_hash(rela->sec, rela->offset);
+	return sec_offset_hash(reloc->sec, reloc->offset);
 }
 
 struct elf *elf_open_read(const char *name, int flags);
 struct section *elf_create_section(struct elf *elf, const char *name, size_t entsize, int nr);
-struct section *elf_create_rela_section(struct elf *elf, struct section *base);
-void elf_add_rela(struct elf *elf, struct rela *rela);
+struct section *elf_create_reloc_section(struct elf *elf, struct section *base);
+void elf_add_reloc(struct elf *elf, struct reloc *reloc);
 int elf_write(const struct elf *elf);
 void elf_close(struct elf *elf);
 
@@ -126,11 +126,11 @@ struct symbol *find_func_by_offset(struct section *sec, unsigned long offset);
 struct symbol *find_symbol_by_offset(struct section *sec, unsigned long offset);
 struct symbol *find_symbol_by_name(const struct elf *elf, const char *name);
 struct symbol *find_symbol_containing(const struct section *sec, unsigned long offset);
-struct rela *find_rela_by_dest(const struct elf *elf, struct section *sec, unsigned long offset);
-struct rela *find_rela_by_dest_range(const struct elf *elf, struct section *sec,
+struct reloc *find_reloc_by_dest(const struct elf *elf, struct section *sec, unsigned long offset);
+struct reloc *find_reloc_by_dest_range(const struct elf *elf, struct section *sec,
 				     unsigned long offset, unsigned int len);
 struct symbol *find_func_containing(struct section *sec, unsigned long offset);
-int elf_rebuild_rela_section(struct section *sec);
+int elf_rebuild_reloc_section(struct section *sec);
 
 #define for_each_sec(file, sec)						\
 	list_for_each_entry(sec, &file->elf->sections, list)
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index c9549988121a..93c720baea66 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -80,56 +80,56 @@ int create_orc(struct objtool_file *file)
 	return 0;
 }
 
-static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relasec,
+static int create_orc_entry(struct elf *elf, struct section *u_sec, struct section *ip_relocsec,
 				unsigned int idx, struct section *insn_sec,
 				unsigned long insn_off, struct orc_entry *o)
 {
 	struct orc_entry *orc;
-	struct rela *rela;
+	struct reloc *reloc;
 
 	/* populate ORC data */
 	orc = (struct orc_entry *)u_sec->data->d_buf + idx;
 	memcpy(orc, o, sizeof(*orc));
 
-	/* populate rela for ip */
-	rela = malloc(sizeof(*rela));
-	if (!rela) {
+	/* populate reloc for ip */
+	reloc = malloc(sizeof(*reloc));
+	if (!reloc) {
 		perror("malloc");
 		return -1;
 	}
-	memset(rela, 0, sizeof(*rela));
+	memset(reloc, 0, sizeof(*reloc));
 
 	if (insn_sec->sym) {
-		rela->sym = insn_sec->sym;
-		rela->addend = insn_off;
+		reloc->sym = insn_sec->sym;
+		reloc->addend = insn_off;
 	} else {
 		/*
 		 * The Clang assembler doesn't produce section symbols, so we
 		 * have to reference the function symbol instead:
 		 */
-		rela->sym = find_symbol_containing(insn_sec, insn_off);
-		if (!rela->sym) {
+		reloc->sym = find_symbol_containing(insn_sec, insn_off);
+		if (!reloc->sym) {
 			/*
 			 * Hack alert.  This happens when we need to reference
 			 * the NOP pad insn immediately after the function.
 			 */
-			rela->sym = find_symbol_containing(insn_sec,
+			reloc->sym = find_symbol_containing(insn_sec,
 							   insn_off - 1);
 		}
-		if (!rela->sym) {
+		if (!reloc->sym) {
 			WARN("missing symbol for insn at offset 0x%lx\n",
 			     insn_off);
 			return -1;
 		}
 
-		rela->addend = insn_off - rela->sym->offset;
+		reloc->addend = insn_off - reloc->sym->offset;
 	}
 
-	rela->type = R_X86_64_PC32;
-	rela->offset = idx * sizeof(int);
-	rela->sec = ip_relasec;
+	reloc->type = R_X86_64_PC32;
+	reloc->offset = idx * sizeof(int);
+	reloc->sec = ip_relocsec;
 
-	elf_add_rela(elf, rela);
+	elf_add_reloc(elf, reloc);
 
 	return 0;
 }
@@ -137,7 +137,7 @@ static int create_orc_entry(struct elf *elf, struct section *u_sec, struct secti
 int create_orc_sections(struct objtool_file *file)
 {
 	struct instruction *insn, *prev_insn;
-	struct section *sec, *u_sec, *ip_relasec;
+	struct section *sec, *u_sec, *ip_relocsec;
 	unsigned int idx;
 
 	struct orc_entry empty = {
@@ -181,8 +181,8 @@ int create_orc_sections(struct objtool_file *file)
 	if (!sec)
 		return -1;
 
-	ip_relasec = elf_create_rela_section(file->elf, sec);
-	if (!ip_relasec)
+	ip_relocsec = elf_create_reloc_section(file->elf, sec);
+	if (!ip_relocsec)
 		return -1;
 
 	/* create .orc_unwind section */
@@ -200,7 +200,7 @@ int create_orc_sections(struct objtool_file *file)
 			if (!prev_insn || memcmp(&insn->orc, &prev_insn->orc,
 						 sizeof(struct orc_entry))) {
 
-				if (create_orc_entry(file->elf, u_sec, ip_relasec, idx,
+				if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx,
 						     insn->sec, insn->offset,
 						     &insn->orc))
 					return -1;
@@ -212,7 +212,7 @@ int create_orc_sections(struct objtool_file *file)
 
 		/* section terminator */
 		if (prev_insn) {
-			if (create_orc_entry(file->elf, u_sec, ip_relasec, idx,
+			if (create_orc_entry(file->elf, u_sec, ip_relocsec, idx,
 					     prev_insn->sec,
 					     prev_insn->offset + prev_insn->len,
 					     &empty))
@@ -222,7 +222,7 @@ int create_orc_sections(struct objtool_file *file)
 		}
 	}
 
-	if (elf_rebuild_rela_section(ip_relasec))
+	if (elf_rebuild_reloc_section(ip_relocsec))
 		return -1;
 
 	return 0;
diff --git a/tools/objtool/special.c b/tools/objtool/special.c
index e74e0189de22..e893f1e48e44 100644
--- a/tools/objtool/special.c
+++ b/tools/objtool/special.c
@@ -72,7 +72,7 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry,
 			 struct section *sec, int idx,
 			 struct special_alt *alt)
 {
-	struct rela *orig_rela, *new_rela;
+	struct reloc *orig_reloc, *new_reloc;
 	unsigned long offset;
 
 	offset = idx * entry->size;
@@ -118,30 +118,30 @@ static int get_alt_entry(struct elf *elf, struct special_entry *entry,
 		}
 	}
 
-	orig_rela = find_rela_by_dest(elf, sec, offset + entry->orig);
-	if (!orig_rela) {
-		WARN_FUNC("can't find orig rela", sec, offset + entry->orig);
+	orig_reloc = find_reloc_by_dest(elf, sec, offset + entry->orig);
+	if (!orig_reloc) {
+		WARN_FUNC("can't find orig reloc", sec, offset + entry->orig);
 		return -1;
 	}
-	if (orig_rela->sym->type != STT_SECTION) {
-		WARN_FUNC("don't know how to handle non-section rela symbol %s",
-			   sec, offset + entry->orig, orig_rela->sym->name);
+	if (orig_reloc->sym->type != STT_SECTION) {
+		WARN_FUNC("don't know how to handle non-section reloc symbol %s",
+			   sec, offset + entry->orig, orig_reloc->sym->name);
 		return -1;
 	}
 
-	alt->orig_sec = orig_rela->sym->sec;
-	alt->orig_off = orig_rela->addend;
+	alt->orig_sec = orig_reloc->sym->sec;
+	alt->orig_off = orig_reloc->addend;
 
 	if (!entry->group || alt->new_len) {
-		new_rela = find_rela_by_dest(elf, sec, offset + entry->new);
-		if (!new_rela) {
-			WARN_FUNC("can't find new rela",
+		new_reloc = find_reloc_by_dest(elf, sec, offset + entry->new);
+		if (!new_reloc) {
+			WARN_FUNC("can't find new reloc",
 				  sec, offset + entry->new);
 			return -1;
 		}
 
-		alt->new_sec = new_rela->sym->sec;
-		alt->new_off = (unsigned int)new_rela->addend;
+		alt->new_sec = new_reloc->sym->sec;
+		alt->new_off = (unsigned int)new_reloc->addend;
 
 		/* _ASM_EXTABLE_EX hack */
 		if (alt->new_off >= 0x7ffffff0)

From fb414783b65c880606fbc1463e6849f017e60d46 Mon Sep 17 00:00:00 2001
From: Matt Helsley <mhelsley@vmware.com>
Date: Fri, 29 May 2020 14:01:14 -0700
Subject: [PATCH 004/502] objtool: Add support for relocations without addends

Currently objtool only collects information about relocations with
addends. In recordmcount, which we are about to merge into objtool,
some supported architectures do not use rela relocations.

Signed-off-by: Matt Helsley <mhelsley@vmware.com>
Reviewed-by: Julien Thierry <jthierry@redhat.com>
Reviewed-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Josh Poimboeuf <jpoimboe@redhat.com>
---
 tools/objtool/elf.c     | 145 +++++++++++++++++++++++++++++++++++-----
 tools/objtool/elf.h     |   7 +-
 tools/objtool/orc_gen.c |   2 +-
 3 files changed, 134 insertions(+), 20 deletions(-)

diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c
index 3160931e858c..95d86bcb9512 100644
--- a/tools/objtool/elf.c
+++ b/tools/objtool/elf.c
@@ -496,6 +496,32 @@ void elf_add_reloc(struct elf *elf, struct reloc *reloc)
 	elf_hash_add(elf->reloc_hash, &reloc->hash, reloc_hash(reloc));
 }
 
+static int read_rel_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx)
+{
+	if (!gelf_getrel(sec->data, i, &reloc->rel)) {
+		WARN_ELF("gelf_getrel");
+		return -1;
+	}
+	reloc->type = GELF_R_TYPE(reloc->rel.r_info);
+	reloc->addend = 0;
+	reloc->offset = reloc->rel.r_offset;
+	*symndx = GELF_R_SYM(reloc->rel.r_info);
+	return 0;
+}
+
+static int read_rela_reloc(struct section *sec, int i, struct reloc *reloc, unsigned int *symndx)
+{
+	if (!gelf_getrela(sec->data, i, &reloc->rela)) {
+		WARN_ELF("gelf_getrela");
+		return -1;
+	}
+	reloc->type = GELF_R_TYPE(reloc->rela.r_info);
+	reloc->addend = reloc->rela.r_addend;
+	reloc->offset = reloc->rela.r_offset;
+	*symndx = GELF_R_SYM(reloc->rela.r_info);
+	return 0;
+}
+
 static int read_relocs(struct elf *elf)
 {
 	struct section *sec;
@@ -505,7 +531,8 @@ static int read_relocs(struct elf *elf)
 	unsigned long nr_reloc, max_reloc = 0, tot_reloc = 0;
 
 	list_for_each_entry(sec, &elf->sections, list) {
-		if (sec->sh.sh_type != SHT_RELA)
+		if ((sec->sh.sh_type != SHT_RELA) &&
+		    (sec->sh.sh_type != SHT_REL))
 			continue;
 
 		sec->base = find_section_by_index(elf, sec->sh.sh_info);
@@ -525,16 +552,17 @@ static int read_relocs(struct elf *elf)
 				return -1;
 			}
 			memset(reloc, 0, sizeof(*reloc));
-
-			if (!gelf_getrela(sec->data, i, &reloc->rela)) {
-				WARN_ELF("gelf_getrela");
-				return -1;
+			switch (sec->sh.sh_type) {
+			case SHT_REL:
+				if (read_rel_reloc(sec, i, reloc, &symndx))
+					return -1;
+				break;
+			case SHT_RELA:
+				if (read_rela_reloc(sec, i, reloc, &symndx))
+					return -1;
+				break;
+			default: return -1;
 			}
-
-			reloc->type = GELF_R_TYPE(reloc->rela.r_info);
-			reloc->addend = reloc->rela.r_addend;
-			reloc->offset = reloc->rela.r_offset;
-			symndx = GELF_R_SYM(reloc->rela.r_info);
 			reloc->sym = find_symbol_by_index(elf, symndx);
 			reloc->sec = sec;
 			if (!reloc->sym) {
@@ -722,7 +750,37 @@ struct section *elf_create_section(struct elf *elf, const char *name,
 	return sec;
 }
 
-struct section *elf_create_reloc_section(struct elf *elf, struct section *base)
+static struct section *elf_create_rel_reloc_section(struct elf *elf, struct section *base)
+{
+	char *relocname;
+	struct section *sec;
+
+	relocname = malloc(strlen(base->name) + strlen(".rel") + 1);
+	if (!relocname) {
+		perror("malloc");
+		return NULL;
+	}
+	strcpy(relocname, ".rel");
+	strcat(relocname, base->name);
+
+	sec = elf_create_section(elf, relocname, sizeof(GElf_Rel), 0);
+	free(relocname);
+	if (!sec)
+		return NULL;
+
+	base->reloc = sec;
+	sec->base = base;
+
+	sec->sh.sh_type = SHT_REL;
+	sec->sh.sh_addralign = 8;
+	sec->sh.sh_link = find_section_by_name(elf, ".symtab")->idx;
+	sec->sh.sh_info = base->idx;
+	sec->sh.sh_flags = SHF_INFO_LINK;
+
+	return sec;
+}
+
+static struct section *elf_create_rela_reloc_section(struct elf *elf, struct section *base)
 {
 	char *relocname;
 	struct section *sec;
@@ -752,16 +810,53 @@ struct section *elf_create_reloc_section(struct elf *elf, struct section *base)
 	return sec;
 }
 
-int elf_rebuild_reloc_section(struct section *sec)
+struct section *elf_create_reloc_section(struct elf *elf,
+					 struct section *base,
+					 int reltype)
+{
+	switch (reltype) {
+	case SHT_REL:  return elf_create_rel_reloc_section(elf, base);
+	case SHT_RELA: return elf_create_rela_reloc_section(elf, base);
+	default:       return NULL;
+	}
+}
+
+static int elf_rebuild_rel_reloc_section(struct section *sec, int nr)
 {
 	struct reloc *reloc;
-	int nr, idx = 0, size;
+	int idx = 0, size;
+	GElf_Rel *relocs;
+
+	/* Allocate a buffer for relocations */
+	size = nr * sizeof(*relocs);
+	relocs = malloc(size);
+	if (!relocs) {
+		perror("malloc");
+		return -1;
+	}
+
+	sec->data->d_buf = relocs;
+	sec->data->d_size = size;
+
+	sec->sh.sh_size = size;
+
+	idx = 0;
+	list_for_each_entry(reloc, &sec->reloc_list, list) {
+		relocs[idx].r_offset = reloc->offset;
+		relocs[idx].r_info = GELF_R_INFO(reloc->sym->idx, reloc->type);
+		idx++;
+	}
+
+	return 0;
+}
+
+static int elf_rebuild_rela_reloc_section(struct section *sec, int nr)
+{
+	struct reloc *reloc;
+	int idx = 0, size;
 	GElf_Rela *relocs;
 
-	nr = 0;
-	list_for_each_entry(reloc, &sec->reloc_list, list)
-		nr++;
-
+	/* Allocate a buffer for relocations with addends */
 	size = nr * sizeof(*relocs);
 	relocs = malloc(size);
 	if (!relocs) {
@@ -785,6 +880,22 @@ int elf_rebuild_reloc_section(struct section *sec)
 	return 0;
 }
 
+int elf_rebuild_reloc_section(struct section *sec)
+{
+	struct reloc *reloc;
+	int nr;
+
+	nr = 0;
+	list_for_each_entry(reloc, &sec->reloc_list, list)
+		nr++;
+
+	switch (sec->sh.sh_type) {
+	case SHT_REL:  return elf_rebuild_rel_reloc_section(sec, nr);
+	case SHT_RELA: return elf_rebuild_rela_reloc_section(sec, nr);
+	default:       return -1;
+	}
+}
+
 int elf_write(const struct elf *elf)
 {
 	struct section *sec;
diff --git a/tools/objtool/elf.h b/tools/objtool/elf.h
index 6ad759fd778e..78a2db23b8b6 100644
--- a/tools/objtool/elf.h
+++ b/tools/objtool/elf.h
@@ -61,7 +61,10 @@ struct symbol {
 struct reloc {
 	struct list_head list;
 	struct hlist_node hash;
-	GElf_Rela rela;
+	union {
+		GElf_Rela rela;
+		GElf_Rel  rel;
+	};
 	struct section *sec;
 	struct symbol *sym;
 	unsigned int type;
@@ -116,7 +119,7 @@ static inline u32 reloc_hash(struct reloc *reloc)
 
 struct elf *elf_open_read(const char *name, int flags);
 struct section *elf_create_section(struct elf *elf, const char *name, size_t entsize, int nr);
-struct section *elf_create_reloc_section(struct elf *elf, struct section *base);
+struct section *elf_create_reloc_section(struct elf *elf, struct section *base, int reltype);
 void elf_add_reloc(struct elf *elf, struct reloc *reloc);
 int elf_write(const struct elf *elf);
 void elf_close(struct elf *elf);
diff --git a/tools/objtool/orc_gen.c b/tools/objtool/orc_gen.c
index 93c720baea66..75e08cf0709b 100644
--- a/tools/objtool/orc_gen.c
+++ b/tools/objtool/orc_gen.c
@@ -181,7 +181,7 @@ int create_orc_sections(struct objtool_file *file)
 	if (!sec)
 		return -1;
 
-	ip_relocsec = elf_create_reloc_section(file->elf, sec);
+	ip_relocsec = elf_create_reloc_section(file->elf, sec, SHT_RELA);
 	if (!ip_relocsec)
 		return -1;
 

From bb85429a9bf2e7d370b8e1afd72f933a88f0629f Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Tue, 19 May 2020 12:18:25 -0700
Subject: [PATCH 005/502] perf/x86/intel/uncore: Add Comet Lake support

The uncore subsystem on Comet Lake is similar to Sky Lake.
The only difference is the new PCI IDs for IMC.

Share the perf code with Sky Lake.
Add new PCI IDs in the table.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1589915905-55870-1-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/uncore.c     |  2 +
 arch/x86/events/intel/uncore_snb.c | 66 ++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index cf76d6631afa..b9c28765bf33 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1514,6 +1514,8 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,		&skx_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,		&skl_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,		&skl_uncore_init),
+	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&skl_uncore_init),
+	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&skl_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,		&icl_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_NNPI,	&icl_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,		&icl_uncore_init),
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 3de1065eefc4..5c4036710b7a 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -42,6 +42,17 @@
 #define PCI_DEVICE_ID_INTEL_WHL_UQ_IMC		0x3ed0
 #define PCI_DEVICE_ID_INTEL_WHL_4_UQ_IMC	0x3e34
 #define PCI_DEVICE_ID_INTEL_WHL_UD_IMC		0x3e35
+#define PCI_DEVICE_ID_INTEL_CML_H1_IMC		0x9b44
+#define PCI_DEVICE_ID_INTEL_CML_H2_IMC		0x9b54
+#define PCI_DEVICE_ID_INTEL_CML_H3_IMC		0x9b64
+#define PCI_DEVICE_ID_INTEL_CML_U1_IMC		0x9b51
+#define PCI_DEVICE_ID_INTEL_CML_U2_IMC		0x9b61
+#define PCI_DEVICE_ID_INTEL_CML_U3_IMC		0x9b71
+#define PCI_DEVICE_ID_INTEL_CML_S1_IMC		0x9b33
+#define PCI_DEVICE_ID_INTEL_CML_S2_IMC		0x9b43
+#define PCI_DEVICE_ID_INTEL_CML_S3_IMC		0x9b53
+#define PCI_DEVICE_ID_INTEL_CML_S4_IMC		0x9b63
+#define PCI_DEVICE_ID_INTEL_CML_S5_IMC		0x9b73
 #define PCI_DEVICE_ID_INTEL_ICL_U_IMC		0x8a02
 #define PCI_DEVICE_ID_INTEL_ICL_U2_IMC		0x8a12
 #define PCI_DEVICE_ID_INTEL_TGL_U1_IMC		0x9a02
@@ -771,6 +782,50 @@ static const struct pci_device_id skl_uncore_pci_ids[] = {
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_WHL_UD_IMC),
 		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
 	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H1_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H2_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_H3_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U1_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U2_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_U3_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S1_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S2_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S3_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S4_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
+	{ /* IMC */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CML_S5_IMC),
+		.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+	},
 	{ /* end: all zeroes */ },
 };
 
@@ -863,6 +918,17 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = {
 	IMC_DEV(WHL_UQ_IMC, &skl_uncore_pci_driver),	/* 8th Gen Core U Mobile Quad Core */
 	IMC_DEV(WHL_4_UQ_IMC, &skl_uncore_pci_driver),	/* 8th Gen Core U Mobile Quad Core */
 	IMC_DEV(WHL_UD_IMC, &skl_uncore_pci_driver),	/* 8th Gen Core U Mobile Dual Core */
+	IMC_DEV(CML_H1_IMC, &skl_uncore_pci_driver),
+	IMC_DEV(CML_H2_IMC, &skl_uncore_pci_driver),
+	IMC_DEV(CML_H3_IMC, &skl_uncore_pci_driver),
+	IMC_DEV(CML_U1_IMC, &skl_uncore_pci_driver),
+	IMC_DEV(CML_U2_IMC, &skl_uncore_pci_driver),
+	IMC_DEV(CML_U3_IMC, &skl_uncore_pci_driver),
+	IMC_DEV(CML_S1_IMC, &skl_uncore_pci_driver),
+	IMC_DEV(CML_S2_IMC, &skl_uncore_pci_driver),
+	IMC_DEV(CML_S3_IMC, &skl_uncore_pci_driver),
+	IMC_DEV(CML_S4_IMC, &skl_uncore_pci_driver),
+	IMC_DEV(CML_S5_IMC, &skl_uncore_pci_driver),
 	IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver),	/* 10th Gen Core Mobile */
 	IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver),	/* 10th Gen Core Mobile */
 	{  /* end marker */ }

From e17d43b93e544f5016c0251d2074c15568d5d963 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:08 +0300
Subject: [PATCH 006/502] perf: Add perf text poke event

Record (single instruction) changes to the kernel text (i.e.
self-modifying code) in order to support tracers like Intel PT and
ARM CoreSight.

A copy of the running kernel code is needed as a reference point (e.g.
from /proc/kcore). The text poke event records the old bytes and the
new bytes so that the event can be processed forwards or backwards.

The basic problem is recording the modified instruction in an
unambiguous manner given SMP instruction cache (in)coherence. That is,
when modifying an instruction concurrently any solution with one or
multiple timestamps is not sufficient:

	CPU0				CPU1
 0
 1	write insn A
 2					execute insn A
 3	sync-I$
 4

Due to I$, CPU1 might execute either the old or new A. No matter where
we record tracepoints on CPU0, one simply cannot tell what CPU1 will
have observed, except that at 0 it must be the old one and at 4 it
must be the new one.

To solve this, take inspiration from x86 text poking, which has to
solve this exact problem due to variable length instruction encoding
and I-fetch windows.

 1) overwrite the instruction with a breakpoint and sync I$

This guarantees that the code flow will never hit the target
instruction anymore, on any CPU (or rather, it will cause an
exception).

 2) issue the TEXT_POKE event

 3) overwrite the breakpoint with the new instruction and sync I$

Now we know that any execution after the TEXT_POKE event will either
observe the breakpoint (and hit the exception) or the new instruction.

So by guarding the TEXT_POKE event with an exception on either side,
we can now tell, without doubt, which instruction another CPU will
have observed.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-2-adrian.hunter@intel.com
---
 include/linux/perf_event.h      |  8 +++
 include/uapi/linux/perf_event.h | 21 +++++++-
 kernel/events/core.c            | 90 ++++++++++++++++++++++++++++++++-
 3 files changed, 117 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b4bb32082342..46fe5cfb5163 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1232,6 +1232,9 @@ extern void perf_event_exec(void);
 extern void perf_event_comm(struct task_struct *tsk, bool exec);
 extern void perf_event_namespaces(struct task_struct *tsk);
 extern void perf_event_fork(struct task_struct *tsk);
+extern void perf_event_text_poke(const void *addr,
+				 const void *old_bytes, size_t old_len,
+				 const void *new_bytes, size_t new_len);
 
 /* Callchains */
 DECLARE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
@@ -1479,6 +1482,11 @@ static inline void perf_event_exec(void)				{ }
 static inline void perf_event_comm(struct task_struct *tsk, bool exec)	{ }
 static inline void perf_event_namespaces(struct task_struct *tsk)	{ }
 static inline void perf_event_fork(struct task_struct *tsk)		{ }
+static inline void perf_event_text_poke(const void *addr,
+					const void *old_bytes,
+					size_t old_len,
+					const void *new_bytes,
+					size_t new_len)			{ }
 static inline void perf_event_init(void)				{ }
 static inline int  perf_swevent_get_recursion_context(void)		{ return -1; }
 static inline void perf_swevent_put_recursion_context(int rctx)		{ }
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 7b2d6fc9e6ed..e5bee6c17b86 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -383,7 +383,8 @@ struct perf_event_attr {
 				bpf_event      :  1, /* include bpf events */
 				aux_output     :  1, /* generate AUX records instead of events */
 				cgroup         :  1, /* include cgroup events */
-				__reserved_1   : 31;
+				text_poke      :  1, /* include text poke events */
+				__reserved_1   : 30;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -1024,6 +1025,24 @@ enum perf_event_type {
 	 */
 	PERF_RECORD_CGROUP			= 19,
 
+	/*
+	 * Records changes to kernel text i.e. self-modified code. 'old_len' is
+	 * the number of old bytes, 'new_len' is the number of new bytes. Either
+	 * 'old_len' or 'new_len' may be zero to indicate, for example, the
+	 * addition or removal of a trampoline. 'bytes' contains the old bytes
+	 * followed immediately by the new bytes.
+	 *
+	 * struct {
+	 *	struct perf_event_header	header;
+	 *	u64				addr;
+	 *	u16				old_len;
+	 *	u16				new_len;
+	 *	u8				bytes[];
+	 *	struct sample_id		sample_id;
+	 * };
+	 */
+	PERF_RECORD_TEXT_POKE			= 20,
+
 	PERF_RECORD_MAX,			/* non-ABI */
 };
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 856d98c36f56..9b8f92500833 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -394,6 +394,7 @@ static atomic_t nr_switch_events __read_mostly;
 static atomic_t nr_ksymbol_events __read_mostly;
 static atomic_t nr_bpf_events __read_mostly;
 static atomic_t nr_cgroup_events __read_mostly;
+static atomic_t nr_text_poke_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -4575,7 +4576,7 @@ static bool is_sb_event(struct perf_event *event)
 	if (attr->mmap || attr->mmap_data || attr->mmap2 ||
 	    attr->comm || attr->comm_exec ||
 	    attr->task || attr->ksymbol ||
-	    attr->context_switch ||
+	    attr->context_switch || attr->text_poke ||
 	    attr->bpf_event)
 		return true;
 	return false;
@@ -4651,6 +4652,8 @@ static void unaccount_event(struct perf_event *event)
 		atomic_dec(&nr_ksymbol_events);
 	if (event->attr.bpf_event)
 		atomic_dec(&nr_bpf_events);
+	if (event->attr.text_poke)
+		atomic_dec(&nr_text_poke_events);
 
 	if (dec) {
 		if (!atomic_add_unless(&perf_sched_count, -1, 1))
@@ -8628,6 +8631,89 @@ void perf_event_bpf_event(struct bpf_prog *prog,
 	perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL);
 }
 
+struct perf_text_poke_event {
+	const void		*old_bytes;
+	const void		*new_bytes;
+	size_t			pad;
+	u16			old_len;
+	u16			new_len;
+
+	struct {
+		struct perf_event_header	header;
+
+		u64				addr;
+	} event_id;
+};
+
+static int perf_event_text_poke_match(struct perf_event *event)
+{
+	return event->attr.text_poke;
+}
+
+static void perf_event_text_poke_output(struct perf_event *event, void *data)
+{
+	struct perf_text_poke_event *text_poke_event = data;
+	struct perf_output_handle handle;
+	struct perf_sample_data sample;
+	u64 padding = 0;
+	int ret;
+
+	if (!perf_event_text_poke_match(event))
+		return;
+
+	perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event);
+
+	ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, text_poke_event->event_id);
+	perf_output_put(&handle, text_poke_event->old_len);
+	perf_output_put(&handle, text_poke_event->new_len);
+
+	__output_copy(&handle, text_poke_event->old_bytes, text_poke_event->old_len);
+	__output_copy(&handle, text_poke_event->new_bytes, text_poke_event->new_len);
+
+	if (text_poke_event->pad)
+		__output_copy(&handle, &padding, text_poke_event->pad);
+
+	perf_event__output_id_sample(event, &handle, &sample);
+
+	perf_output_end(&handle);
+}
+
+void perf_event_text_poke(const void *addr, const void *old_bytes,
+			  size_t old_len, const void *new_bytes, size_t new_len)
+{
+	struct perf_text_poke_event text_poke_event;
+	size_t tot, pad;
+
+	if (!atomic_read(&nr_text_poke_events))
+		return;
+
+	tot  = sizeof(text_poke_event.old_len) + old_len;
+	tot += sizeof(text_poke_event.new_len) + new_len;
+	pad  = ALIGN(tot, sizeof(u64)) - tot;
+
+	text_poke_event = (struct perf_text_poke_event){
+		.old_bytes    = old_bytes,
+		.new_bytes    = new_bytes,
+		.pad          = pad,
+		.old_len      = old_len,
+		.new_len      = new_len,
+		.event_id  = {
+			.header = {
+				.type = PERF_RECORD_TEXT_POKE,
+				.misc = PERF_RECORD_MISC_KERNEL,
+				.size = sizeof(text_poke_event.event_id) + tot + pad,
+			},
+			.addr = (unsigned long)addr,
+		},
+	};
+
+	perf_iterate_sb(perf_event_text_poke_output, &text_poke_event, NULL);
+}
+
 void perf_event_itrace_started(struct perf_event *event)
 {
 	event->attach_state |= PERF_ATTACH_ITRACE;
@@ -10945,6 +11031,8 @@ static void account_event(struct perf_event *event)
 		atomic_inc(&nr_ksymbol_events);
 	if (event->attr.bpf_event)
 		atomic_inc(&nr_bpf_events);
+	if (event->attr.text_poke)
+		atomic_inc(&nr_text_poke_events);
 
 	if (inc) {
 		/*

From d769811ca93303deb1d8729d20cceaca7051a6f1 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:09 +0300
Subject: [PATCH 007/502] perf/x86: Add support for perf text poke event for
 text_poke_bp_batch() callers

Add support for perf text poke event for text_poke_bp_batch() callers. That
includes jump labels. See comments for more details.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-3-adrian.hunter@intel.com
---
 arch/x86/kernel/alternative.c | 37 ++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 8fd39ff74a49..f94c9f371411 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -3,6 +3,7 @@
 
 #include <linux/module.h>
 #include <linux/sched.h>
+#include <linux/perf_event.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/stringify.h>
@@ -1001,6 +1002,7 @@ struct text_poke_loc {
 	s32 rel32;
 	u8 opcode;
 	const u8 text[POKE_MAX_OPCODE_SIZE];
+	u8 old;
 };
 
 struct bp_patching_desc {
@@ -1168,8 +1170,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	/*
 	 * First step: add a int3 trap to the address that will be patched.
 	 */
-	for (i = 0; i < nr_entries; i++)
+	for (i = 0; i < nr_entries; i++) {
+		tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
 		text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
+	}
 
 	text_poke_sync();
 
@@ -1177,14 +1181,45 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
 	 * Second step: update all but the first byte of the patched range.
 	 */
 	for (do_sync = 0, i = 0; i < nr_entries; i++) {
+		u8 old[POKE_MAX_OPCODE_SIZE] = { tp[i].old, };
 		int len = text_opcode_size(tp[i].opcode);
 
 		if (len - INT3_INSN_SIZE > 0) {
+			memcpy(old + INT3_INSN_SIZE,
+			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
+			       len - INT3_INSN_SIZE);
 			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
 				  (const char *)tp[i].text + INT3_INSN_SIZE,
 				  len - INT3_INSN_SIZE);
 			do_sync++;
 		}
+
+		/*
+		 * Emit a perf event to record the text poke, primarily to
+		 * support Intel PT decoding which must walk the executable code
+		 * to reconstruct the trace. The flow up to here is:
+		 *   - write INT3 byte
+		 *   - IPI-SYNC
+		 *   - write instruction tail
+		 * At this point the actual control flow will be through the
+		 * INT3 and handler and not hit the old or new instruction.
+		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
+		 * can still be decoded. Subsequently:
+		 *   - emit RECORD_TEXT_POKE with the new instruction
+		 *   - IPI-SYNC
+		 *   - write first byte
+		 *   - IPI-SYNC
+		 * So before the text poke event timestamp, the decoder will see
+		 * either the old instruction flow or FUP/TIP of INT3. After the
+		 * text poke event timestamp, the decoder will see either the
+		 * new instruction flow or FUP/TIP of INT3. Thus decoders can
+		 * use the timestamp as the point at which to modify the
+		 * executable code.
+		 * The old instruction is recorded so that the event can be
+		 * processed forwards or backwards.
+		 */
+		perf_event_text_poke(text_poke_addr(&tp[i]), old, len,
+				     tp[i].text, len);
 	}
 
 	if (do_sync) {

From d002b8bc6dbc20e9043e279196cff8795dba05fe Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Thu, 28 May 2020 11:00:58 +0300
Subject: [PATCH 008/502] kprobes: Add symbols for kprobe insn pages

Symbols are needed for tools to describe instruction addresses. Pages
allocated for kprobe's purposes need symbols to be created for them.
Add such symbols to be visible via /proc/kallsyms.

Note: kprobe insn pages are not used if ftrace is configured. To see the
effect of this patch, the kernel must be configured with:

	# CONFIG_FUNCTION_TRACER is not set
	CONFIG_KPROBES=y

and for optimised kprobes:

	CONFIG_OPTPROBES=y

Example on x86:

	# perf probe __schedule
	Added new event:
	  probe:__schedule     (on __schedule)
	# cat /proc/kallsyms | grep '\[__builtin__kprobes\]'
	ffffffffc00d4000 t kprobe_insn_page     [__builtin__kprobes]
	ffffffffc00d6000 t kprobe_optinsn_page  [__builtin__kprobes]

Note: This patch adds "__builtin__kprobes" as a module name in
/proc/kallsyms for symbols for pages allocated for kprobes' purposes, even
though "__builtin__kprobes" is not a module.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lkml.kernel.org/r/20200528080058.20230-1-adrian.hunter@intel.com
---
 include/linux/kprobes.h | 15 ++++++++++++++
 kernel/kallsyms.c       | 37 +++++++++++++++++++++++++++++----
 kernel/kprobes.c        | 45 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 4 deletions(-)

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 594265bfd390..13fc58a74c04 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -242,6 +242,7 @@ struct kprobe_insn_cache {
 	struct mutex mutex;
 	void *(*alloc)(void);	/* allocate insn page */
 	void (*free)(void *);	/* free insn page */
+	const char *sym;	/* symbol for insn pages */
 	struct list_head pages; /* list of kprobe_insn_page */
 	size_t insn_size;	/* size of instruction slot */
 	int nr_garbage;
@@ -272,6 +273,10 @@ static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
 {									\
 	return __is_insn_slot_addr(&kprobe_##__name##_slots, addr);	\
 }
+#define KPROBE_INSN_PAGE_SYM		"kprobe_insn_page"
+#define KPROBE_OPTINSN_PAGE_SYM		"kprobe_optinsn_page"
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+			     unsigned long *value, char *type, char *sym);
 #else /* __ARCH_WANT_KPROBES_INSN_SLOT */
 #define DEFINE_INSN_CACHE_OPS(__name)					\
 static inline bool is_kprobe_##__name##_slot(unsigned long addr)	\
@@ -373,6 +378,11 @@ void dump_kprobe(struct kprobe *kp);
 void *alloc_insn_page(void);
 void free_insn_page(void *page);
 
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		       char *sym);
+
+int arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+			    char *type, char *sym);
 #else /* !CONFIG_KPROBES: */
 
 static inline int kprobes_built_in(void)
@@ -435,6 +445,11 @@ static inline bool within_kprobe_blacklist(unsigned long addr)
 {
 	return true;
 }
+static inline int kprobe_get_kallsym(unsigned int symnum, unsigned long *value,
+				     char *type, char *sym)
+{
+	return -ERANGE;
+}
 #endif /* CONFIG_KPROBES */
 static inline int disable_kretprobe(struct kretprobe *rp)
 {
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 16c8c605f4b0..c6cc293c0e67 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/filter.h>
 #include <linux/ftrace.h>
+#include <linux/kprobes.h>
 #include <linux/compiler.h>
 
 /*
@@ -437,6 +438,7 @@ struct kallsym_iter {
 	loff_t pos_arch_end;
 	loff_t pos_mod_end;
 	loff_t pos_ftrace_mod_end;
+	loff_t pos_bpf_end;
 	unsigned long value;
 	unsigned int nameoff; /* If iterating in core kernel symbols. */
 	char type;
@@ -496,11 +498,33 @@ static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
 
 static int get_ksymbol_bpf(struct kallsym_iter *iter)
 {
+	int ret;
+
 	strlcpy(iter->module_name, "bpf", MODULE_NAME_LEN);
 	iter->exported = 0;
-	return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
-			       &iter->value, &iter->type,
-			       iter->name) < 0 ? 0 : 1;
+	ret = bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
+			      &iter->value, &iter->type,
+			      iter->name);
+	if (ret < 0) {
+		iter->pos_bpf_end = iter->pos;
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * This uses "__builtin__kprobes" as a module name for symbols for pages
+ * allocated for kprobes' purposes, even though "__builtin__kprobes" is not a
+ * module.
+ */
+static int get_ksymbol_kprobe(struct kallsym_iter *iter)
+{
+	strlcpy(iter->module_name, "__builtin__kprobes", MODULE_NAME_LEN);
+	iter->exported = 0;
+	return kprobe_get_kallsym(iter->pos - iter->pos_bpf_end,
+				  &iter->value, &iter->type,
+				  iter->name) < 0 ? 0 : 1;
 }
 
 /* Returns space to next name. */
@@ -527,6 +551,7 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
 		iter->pos_arch_end = 0;
 		iter->pos_mod_end = 0;
 		iter->pos_ftrace_mod_end = 0;
+		iter->pos_bpf_end = 0;
 	}
 }
 
@@ -551,7 +576,11 @@ static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
 	    get_ksymbol_ftrace_mod(iter))
 		return 1;
 
-	return get_ksymbol_bpf(iter);
+	if ((!iter->pos_bpf_end || iter->pos_bpf_end > pos) &&
+	    get_ksymbol_bpf(iter))
+		return 1;
+
+	return get_ksymbol_kprobe(iter);
 }
 
 /* Returns false if pos at or past end of file. */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 50cd84f53df0..058c0be3464b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -118,6 +118,7 @@ struct kprobe_insn_cache kprobe_insn_slots = {
 	.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
 	.alloc = alloc_insn_page,
 	.free = free_insn_page,
+	.sym = KPROBE_INSN_PAGE_SYM,
 	.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
 	.insn_size = MAX_INSN_SIZE,
 	.nr_garbage = 0,
@@ -290,12 +291,34 @@ bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
 	return ret;
 }
 
+int kprobe_cache_get_kallsym(struct kprobe_insn_cache *c, unsigned int *symnum,
+			     unsigned long *value, char *type, char *sym)
+{
+	struct kprobe_insn_page *kip;
+	int ret = -ERANGE;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(kip, &c->pages, list) {
+		if ((*symnum)--)
+			continue;
+		strlcpy(sym, c->sym, KSYM_NAME_LEN);
+		*type = 't';
+		*value = (unsigned long)kip->insns;
+		ret = 0;
+		break;
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
 #ifdef CONFIG_OPTPROBES
 /* For optimized_kprobe buffer */
 struct kprobe_insn_cache kprobe_optinsn_slots = {
 	.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
 	.alloc = alloc_insn_page,
 	.free = free_insn_page,
+	.sym = KPROBE_OPTINSN_PAGE_SYM,
 	.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
 	/* .insn_size is initialized later */
 	.nr_garbage = 0,
@@ -2197,6 +2220,28 @@ static void kprobe_remove_ksym_blacklist(unsigned long entry)
 	kprobe_remove_area_blacklist(entry, entry + 1);
 }
 
+int __weak arch_kprobe_get_kallsym(unsigned int *symnum, unsigned long *value,
+				   char *type, char *sym)
+{
+	return -ERANGE;
+}
+
+int kprobe_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
+		       char *sym)
+{
+#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
+	if (!kprobe_cache_get_kallsym(&kprobe_insn_slots, &symnum, value, type, sym))
+		return 0;
+#ifdef CONFIG_OPTPROBES
+	if (!kprobe_cache_get_kallsym(&kprobe_optinsn_slots, &symnum, value, type, sym))
+		return 0;
+#endif
+#endif
+	if (!arch_kprobe_get_kallsym(&symnum, value, type, sym))
+		return 0;
+	return -ERANGE;
+}
+
 int __init __weak arch_populate_kprobe_blacklist(void)
 {
 	return 0;

From 69e49088692899d25dedfa22f00dfb9761e86ed7 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:11 +0300
Subject: [PATCH 009/502] kprobes: Add perf ksymbol events for kprobe insn
 pages

Symbols are needed for tools to describe instruction addresses. Pages
allocated for kprobe's purposes need symbols to be created for them.
Add such symbols to be visible via perf ksymbol events.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-5-adrian.hunter@intel.com
---
 include/uapi/linux/perf_event.h |  5 +++++
 kernel/kprobes.c                | 12 ++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e5bee6c17b86..e1a4179144a1 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1049,6 +1049,11 @@ enum perf_event_type {
 enum perf_record_ksymbol_type {
 	PERF_RECORD_KSYMBOL_TYPE_UNKNOWN	= 0,
 	PERF_RECORD_KSYMBOL_TYPE_BPF		= 1,
+	/*
+	 * Out of line code such as kprobe-replaced instructions or optimized
+	 * kprobes.
+	 */
+	PERF_RECORD_KSYMBOL_TYPE_OOL		= 2,
 	PERF_RECORD_KSYMBOL_TYPE_MAX		/* non-ABI */
 };
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 058c0be3464b..2b58740ca0f3 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,6 +35,7 @@
 #include <linux/ftrace.h>
 #include <linux/cpu.h>
 #include <linux/jump_label.h>
+#include <linux/perf_event.h>
 
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
@@ -184,6 +185,10 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
 	kip->cache = c;
 	list_add_rcu(&kip->list, &c->pages);
 	slot = kip->insns;
+
+	/* Record the perf ksymbol register event after adding the page */
+	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
+			   PAGE_SIZE, false, c->sym);
 out:
 	mutex_unlock(&c->mutex);
 	return slot;
@@ -202,6 +207,13 @@ static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
 		 * next time somebody inserts a probe.
 		 */
 		if (!list_is_singular(&kip->list)) {
+			/*
+			 * Record perf ksymbol unregister event before removing
+			 * the page.
+			 */
+			perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+					   (unsigned long)kip->insns, PAGE_SIZE, true,
+					   kip->cache->sym);
 			list_del_rcu(&kip->list);
 			synchronize_rcu();
 			kip->cache->free(kip->insns);

From 3e46bb40af8c12947c093efb8af56e0e921cd39b Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:12 +0300
Subject: [PATCH 010/502] perf/x86: Add perf text poke events for kprobes

Add perf text poke events for kprobes. That includes:

 - the replaced instruction(s) which are executed out-of-line
   i.e. arch_copy_kprobe() and arch_remove_kprobe()

 - the INT3 that activates the kprobe
   i.e. arch_arm_kprobe() and arch_disarm_kprobe()

 - optimised kprobe function
   i.e. arch_prepare_optimized_kprobe() and
      __arch_remove_optimized_kprobe()

 - optimised kprobe
   i.e. arch_optimize_kprobes() and arch_unoptimize_kprobe()

Resulting in 8 possible text_poke events:

 0:  NULL -> probe.ainsn.insn (if ainsn.boostable && !kp.post_handler)
					arch_copy_kprobe()

 1:  old0 -> INT3			arch_arm_kprobe()

 // boosted kprobe active

 2:  NULL -> optprobe_trampoline	arch_prepare_optimized_kprobe()

 3:  INT3,old1,old2,old3,old4 -> JMP32	arch_optimize_kprobes()

 // optprobe active

 4:  JMP32 -> INT3,old1,old2,old3,old4

 // optprobe disabled and kprobe active (this sometimes goes back to 3)
					arch_unoptimize_kprobe()

 5:  optprobe_trampoline -> NULL	arch_remove_optimized_kprobe()

 // boosted kprobe active

 6:  INT3 -> old0			arch_disarm_kprobe()

 7:  probe.ainsn.insn -> NULL (if ainsn.boostable && !kp.post_handler)
					arch_remove_kprobe()

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Masami Hiramatsu <mhiramat@kernel.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-6-adrian.hunter@intel.com
---
 arch/x86/include/asm/kprobes.h |  2 ++
 arch/x86/kernel/kprobes/core.c | 15 +++++++++++++-
 arch/x86/kernel/kprobes/opt.c  | 38 +++++++++++++++++++++++++++++-----
 3 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 073eb7ad2f56..143bc9abe99c 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -66,6 +66,8 @@ struct arch_specific_insn {
 	 */
 	bool boostable;
 	bool if_modifier;
+	/* Number of bytes of text poked */
+	int tp_len;
 };
 
 struct arch_optimized_insn {
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 3bafe1bd4dc7..bcc53c0d17c1 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -33,6 +33,7 @@
 #include <linux/hardirq.h>
 #include <linux/preempt.h>
 #include <linux/sched/debug.h>
+#include <linux/perf_event.h>
 #include <linux/extable.h>
 #include <linux/kdebug.h>
 #include <linux/kallsyms.h>
@@ -471,6 +472,9 @@ static int arch_copy_kprobe(struct kprobe *p)
 	/* Also, displacement change doesn't affect the first byte */
 	p->opcode = buf[0];
 
+	p->ainsn.tp_len = len;
+	perf_event_text_poke(p->ainsn.insn, NULL, 0, buf, len);
+
 	/* OK, write back the instruction(s) into ROX insn buffer */
 	text_poke(p->ainsn.insn, buf, len);
 
@@ -502,12 +506,18 @@ int arch_prepare_kprobe(struct kprobe *p)
 
 void arch_arm_kprobe(struct kprobe *p)
 {
-	text_poke(p->addr, ((unsigned char []){INT3_INSN_OPCODE}), 1);
+	u8 int3 = INT3_INSN_OPCODE;
+
+	text_poke(p->addr, &int3, 1);
 	text_poke_sync();
+	perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1);
 }
 
 void arch_disarm_kprobe(struct kprobe *p)
 {
+	u8 int3 = INT3_INSN_OPCODE;
+
+	perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1);
 	text_poke(p->addr, &p->opcode, 1);
 	text_poke_sync();
 }
@@ -515,6 +525,9 @@ void arch_disarm_kprobe(struct kprobe *p)
 void arch_remove_kprobe(struct kprobe *p)
 {
 	if (p->ainsn.insn) {
+		/* Record the perf event before freeing the slot */
+		perf_event_text_poke(p->ainsn.insn, p->ainsn.insn,
+				     p->ainsn.tp_len, NULL, 0);
 		free_insn_slot(p->ainsn.insn, p->ainsn.boostable);
 		p->ainsn.insn = NULL;
 	}
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 321c19950285..3239b6a80bce 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -6,6 +6,7 @@
  * Copyright (C) Hitachi Ltd., 2012
  */
 #include <linux/kprobes.h>
+#include <linux/perf_event.h>
 #include <linux/ptrace.h>
 #include <linux/string.h>
 #include <linux/slab.h>
@@ -352,8 +353,15 @@ int arch_within_optimized_kprobe(struct optimized_kprobe *op,
 static
 void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
 {
-	if (op->optinsn.insn) {
-		free_optinsn_slot(op->optinsn.insn, dirty);
+	u8 *slot = op->optinsn.insn;
+	if (slot) {
+		int len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE;
+
+		/* Record the perf event before freeing the slot */
+		if (dirty)
+			perf_event_text_poke(slot, slot, len, NULL, 0);
+
+		free_optinsn_slot(slot, dirty);
 		op->optinsn.insn = NULL;
 		op->optinsn.size = 0;
 	}
@@ -424,8 +432,15 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op,
 			   (u8 *)op->kp.addr + op->optinsn.size);
 	len += JMP32_INSN_SIZE;
 
+	/*
+	 * Note	len = TMPL_END_IDX + op->optinsn.size + JMP32_INSN_SIZE is also
+	 * used in __arch_remove_optimized_kprobe().
+	 */
+
 	/* We have to use text_poke() for instruction buffer because it is RO */
+	perf_event_text_poke(slot, NULL, 0, buf, len);
 	text_poke(slot, buf, len);
+
 	ret = 0;
 out:
 	kfree(buf);
@@ -477,10 +492,23 @@ void arch_optimize_kprobes(struct list_head *oplist)
  */
 void arch_unoptimize_kprobe(struct optimized_kprobe *op)
 {
-	arch_arm_kprobe(&op->kp);
-	text_poke(op->kp.addr + INT3_INSN_SIZE,
-		  op->optinsn.copied_insn, DISP32_SIZE);
+	u8 new[JMP32_INSN_SIZE] = { INT3_INSN_OPCODE, };
+	u8 old[JMP32_INSN_SIZE];
+	u8 *addr = op->kp.addr;
+
+	memcpy(old, op->kp.addr, JMP32_INSN_SIZE);
+	memcpy(new + INT3_INSN_SIZE,
+	       op->optinsn.copied_insn,
+	       JMP32_INSN_SIZE - INT3_INSN_SIZE);
+
+	text_poke(addr, new, INT3_INSN_SIZE);
 	text_poke_sync();
+	text_poke(addr + INT3_INSN_SIZE,
+		  new + INT3_INSN_SIZE,
+		  JMP32_INSN_SIZE - INT3_INSN_SIZE);
+	text_poke_sync();
+
+	perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE);
 }
 
 /*

From fc0ea795f53c8d7040fa42471f74fe51d78d0834 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:13 +0300
Subject: [PATCH 011/502] ftrace: Add symbols for ftrace trampolines

Symbols are needed for tools to describe instruction addresses. Pages
allocated for ftrace's purposes need symbols to be created for them.
Add such symbols to be visible via /proc/kallsyms.

Example on x86 with CONFIG_DYNAMIC_FTRACE=y

	# echo function > /sys/kernel/debug/tracing/current_tracer
	# cat /proc/kallsyms | grep '\[__builtin__ftrace\]'
	ffffffffc0238000 t ftrace_trampoline    [__builtin__ftrace]

Note: This patch adds "__builtin__ftrace" as a module name in /proc/kallsyms for
symbols for pages allocated for ftrace's purposes, even though "__builtin__ftrace"
is not a module.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-7-adrian.hunter@intel.com
---
 include/linux/ftrace.h | 12 ++++---
 kernel/kallsyms.c      |  5 +++
 kernel/trace/ftrace.c  | 77 ++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e339dac91ee6..ce2c06f72e86 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -58,9 +58,6 @@ struct ftrace_direct_func;
 const char *
 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 		   unsigned long *off, char **modname, char *sym);
-int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
-			   char *type, char *name,
-			   char *module_name, int *exported);
 #else
 static inline const char *
 ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
@@ -68,6 +65,13 @@ ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
 {
 	return NULL;
 }
+#endif
+
+#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE)
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+			   char *type, char *name,
+			   char *module_name, int *exported);
+#else
 static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
 					 char *type, char *name,
 					 char *module_name, int *exported)
@@ -76,7 +80,6 @@ static inline int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *val
 }
 #endif
 
-
 #ifdef CONFIG_FUNCTION_TRACER
 
 extern int ftrace_enabled;
@@ -207,6 +210,7 @@ struct ftrace_ops {
 	struct ftrace_ops_hash		old_hash;
 	unsigned long			trampoline;
 	unsigned long			trampoline_size;
+	struct list_head		list;
 #endif
 };
 
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index c6cc293c0e67..834bfdc43235 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -482,6 +482,11 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
 	return 1;
 }
 
+/*
+ * ftrace_mod_get_kallsym() may also get symbols for pages allocated for ftrace
+ * purposes. In that case "__builtin__ftrace" is used as a module name, even
+ * though "__builtin__ftrace" is not a module.
+ */
 static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
 {
 	int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c163c3531faf..31675b209db2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2764,6 +2764,38 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
 {
 }
 
+/* List of trace_ops that have allocated trampolines */
+static LIST_HEAD(ftrace_ops_trampoline_list);
+
+static void ftrace_add_trampoline_to_kallsyms(struct ftrace_ops *ops)
+{
+	lockdep_assert_held(&ftrace_lock);
+	list_add_rcu(&ops->list, &ftrace_ops_trampoline_list);
+}
+
+static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
+{
+	lockdep_assert_held(&ftrace_lock);
+	list_del_rcu(&ops->list);
+}
+
+/*
+ * "__builtin__ftrace" is used as a module name in /proc/kallsyms for symbols
+ * for pages allocated for ftrace purposes, even though "__builtin__ftrace" is
+ * not a module.
+ */
+#define FTRACE_TRAMPOLINE_MOD "__builtin__ftrace"
+#define FTRACE_TRAMPOLINE_SYM "ftrace_trampoline"
+
+static void ftrace_trampoline_free(struct ftrace_ops *ops)
+{
+	if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
+	    ops->trampoline)
+		ftrace_remove_trampoline_from_kallsyms(ops);
+
+	arch_ftrace_trampoline_free(ops);
+}
+
 static void ftrace_startup_enable(int command)
 {
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -2934,7 +2966,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
 			synchronize_rcu_tasks();
 
  free_ops:
-		arch_ftrace_trampoline_free(ops);
+		ftrace_trampoline_free(ops);
 	}
 
 	return 0;
@@ -6178,6 +6210,27 @@ struct ftrace_mod_map {
 	unsigned int		num_funcs;
 };
 
+static int ftrace_get_trampoline_kallsym(unsigned int symnum,
+					 unsigned long *value, char *type,
+					 char *name, char *module_name,
+					 int *exported)
+{
+	struct ftrace_ops *op;
+
+	list_for_each_entry_rcu(op, &ftrace_ops_trampoline_list, list) {
+		if (!op->trampoline || symnum--)
+			continue;
+		*value = op->trampoline;
+		*type = 't';
+		strlcpy(name, FTRACE_TRAMPOLINE_SYM, KSYM_NAME_LEN);
+		strlcpy(module_name, FTRACE_TRAMPOLINE_MOD, MODULE_NAME_LEN);
+		*exported = 0;
+		return 0;
+	}
+
+	return -ERANGE;
+}
+
 #ifdef CONFIG_MODULES
 
 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
@@ -6514,6 +6567,7 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
 {
 	struct ftrace_mod_map *mod_map;
 	struct ftrace_mod_func *mod_func;
+	int ret;
 
 	preempt_disable();
 	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
@@ -6540,8 +6594,10 @@ int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
 		WARN_ON(1);
 		break;
 	}
+	ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+					    module_name, exported);
 	preempt_enable();
-	return -ERANGE;
+	return ret;
 }
 
 #else
@@ -6553,6 +6609,18 @@ allocate_ftrace_mod_map(struct module *mod,
 {
 	return NULL;
 }
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+			   char *type, char *name, char *module_name,
+			   int *exported)
+{
+	int ret;
+
+	preempt_disable();
+	ret = ftrace_get_trampoline_kallsym(symnum, value, type, name,
+					    module_name, exported);
+	preempt_enable();
+	return ret;
+}
 #endif /* CONFIG_MODULES */
 
 struct ftrace_init_func {
@@ -6733,7 +6801,12 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops)
 
 static void ftrace_update_trampoline(struct ftrace_ops *ops)
 {
+	unsigned long trampoline = ops->trampoline;
+
 	arch_ftrace_update_trampoline(ops);
+	if (ops->trampoline && ops->trampoline != trampoline &&
+	    (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+		ftrace_add_trampoline_to_kallsyms(ops);
 }
 
 void ftrace_init_trace_array(struct trace_array *tr)

From dd9ddf466ad7a5d2e247925d81ebb0b878bf3b76 Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:14 +0300
Subject: [PATCH 012/502] ftrace: Add perf ksymbol events for ftrace
 trampolines

Symbols are needed for tools to describe instruction addresses. Pages
allocated for ftrace's purposes need symbols to be created for them.
Add such symbols to be visible via perf ksymbol events.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-8-adrian.hunter@intel.com
---
 include/uapi/linux/perf_event.h |  2 +-
 kernel/trace/ftrace.c           | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e1a4179144a1..52ca2093831c 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -1051,7 +1051,7 @@ enum perf_record_ksymbol_type {
 	PERF_RECORD_KSYMBOL_TYPE_BPF		= 1,
 	/*
 	 * Out of line code such as kprobe-replaced instructions or optimized
-	 * kprobes.
+	 * kprobes or ftrace trampolines.
 	 */
 	PERF_RECORD_KSYMBOL_TYPE_OOL		= 2,
 	PERF_RECORD_KSYMBOL_TYPE_MAX		/* non-ABI */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 31675b209db2..2baaf7716537 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2790,8 +2790,13 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
 static void ftrace_trampoline_free(struct ftrace_ops *ops)
 {
 	if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
-	    ops->trampoline)
+	    ops->trampoline) {
+		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+				   ops->trampoline, ops->trampoline_size,
+				   true, FTRACE_TRAMPOLINE_SYM);
+		/* Remove from kallsyms after the perf events */
 		ftrace_remove_trampoline_from_kallsyms(ops);
+	}
 
 	arch_ftrace_trampoline_free(ops);
 }
@@ -6805,8 +6810,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
 
 	arch_ftrace_update_trampoline(ops);
 	if (ops->trampoline && ops->trampoline != trampoline &&
-	    (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
+	    (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) {
+		/* Add to kallsyms before the perf events */
 		ftrace_add_trampoline_to_kallsyms(ops);
+		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
+				   ops->trampoline, ops->trampoline_size, false,
+				   FTRACE_TRAMPOLINE_SYM);
+	}
 }
 
 void ftrace_init_trace_array(struct trace_array *tr)

From 548e1f6c76e1eb80ba29edd4286b9b9f2c37f5bf Mon Sep 17 00:00:00 2001
From: Adrian Hunter <adrian.hunter@intel.com>
Date: Tue, 12 May 2020 15:19:15 +0300
Subject: [PATCH 013/502] ftrace: Add perf text poke events for ftrace
 trampolines

Add perf text poke events for ftrace trampolines when created and when
freed.

There can be 3 text_poke events for ftrace trampolines:

1. NULL -> trampoline
   By ftrace_update_trampoline() when !ops->trampoline
   Trampoline created

2. [e.g. on x86] CALL rel32 -> CALL rel32
   By arch_ftrace_update_trampoline() when ops->trampoline and
                        ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP
   [e.g. on x86] via text_poke_bp() which generates text poke events
   Trampoline-called function target updated

3. trampoline -> NULL
   By ftrace_trampoline_free() when ops->trampoline and
                 ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP
   Trampoline freed

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200512121922.8997-9-adrian.hunter@intel.com
---
 kernel/trace/ftrace.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2baaf7716537..d6bba734ab72 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2791,6 +2791,13 @@ static void ftrace_trampoline_free(struct ftrace_ops *ops)
 {
 	if (ops && (ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP) &&
 	    ops->trampoline) {
+		/*
+		 * Record the text poke event before the ksymbol unregister
+		 * event.
+		 */
+		perf_event_text_poke((void *)ops->trampoline,
+				     (void *)ops->trampoline,
+				     ops->trampoline_size, NULL, 0);
 		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
 				   ops->trampoline, ops->trampoline_size,
 				   true, FTRACE_TRAMPOLINE_SYM);
@@ -6816,6 +6823,13 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
 		perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL,
 				   ops->trampoline, ops->trampoline_size, false,
 				   FTRACE_TRAMPOLINE_SYM);
+		/*
+		 * Record the perf text poke event after the ksymbol register
+		 * event.
+		 */
+		perf_event_text_poke((void *)ops->trampoline, NULL, 0,
+				     (void *)ops->trampoline,
+				     ops->trampoline_size);
 	}
 }
 

From 2af834f1faab3f1e218fcbcab70a399121620d62 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Thu, 28 May 2020 08:19:27 -0700
Subject: [PATCH 014/502] perf/x86/intel/uncore: Fix oops when counting IMC
 uncore events on some TGL

When counting IMC uncore events on some TGL machines, an oops will be
triggered.
  [ 393.101262] BUG: unable to handle page fault for address:
  ffffb45200e15858
  [ 393.101269] #PF: supervisor read access in kernel mode
  [ 393.101271] #PF: error_code(0x0000) - not-present page

The current perf uncore driver still uses the IMC MAP SIZE inherited
from SNB, which is 0x6000.
However, the offset of IMC uncore counters is larger than 0x6000,
e.g. 0xd8a0.

Enlarge the IMC MAP SIZE for TGL to 0xe000.

Fixes: fdb64822443e ("perf/x86: Add Intel Tiger Lake uncore support")
Reported-by: Ammy Yi <ammy.yi@intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Ammy Yi <ammy.yi@intel.com>
Tested-by: Chao Qin <chao.qin@intel.com>
Link: https://lkml.kernel.org/r/1590679169-61823-1-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/uncore_snb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 5c4036710b7a..d5ae3a822193 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1151,6 +1151,7 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void)
 }
 
 #define TGL_UNCORE_MMIO_IMC_MEM_OFFSET		0x10000
+#define TGL_UNCORE_PCI_IMC_MAP_SIZE		0xe000
 
 static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
 {
@@ -1178,7 +1179,7 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
 	addr |= ((resource_size_t)mch_bar << 32);
 #endif
 
-	box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+	box->io_addr = ioremap(addr, TGL_UNCORE_PCI_IMC_MAP_SIZE);
 }
 
 static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = {

From 1b94d31de422399421422af0e63c9685e7485901 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Thu, 28 May 2020 08:19:28 -0700
Subject: [PATCH 015/502] perf/x86/intel/uncore: Record the size of mapped area

Perf cannot validate an address before the actual access to MMIO space
of some uncore units, e.g. IMC on TGL. Accessing an invalid address,
which exceeds mapped area, can trigger oops.

Perf never records the size of mapped area. Generic functions, e.g.
uncore_mmio_read_counter(), cannot get the correct size for address
validation.

Add mmio_map_size in intel_uncore_type to record the size of mapped
area. Print warning message if ioremap fails.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1590679169-61823-2-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/uncore.h       |  1 +
 arch/x86/events/intel/uncore_snb.c   | 13 +++++++++++--
 arch/x86/events/intel/uncore_snbep.c | 11 +++++++++--
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index b469ddd45515..79ff626b7ea6 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -61,6 +61,7 @@ struct intel_uncore_type {
 		unsigned msr_offset;
 		unsigned mmio_offset;
 	};
+	unsigned mmio_map_size;
 	unsigned num_shared_regs:8;
 	unsigned single_fixed:1;
 	unsigned pair_ctr_ctl:1;
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index d5ae3a822193..cb94ba86efd2 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -426,6 +426,7 @@ static const struct attribute_group snb_uncore_imc_format_group = {
 
 static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
 {
+	struct intel_uncore_type *type = box->pmu->type;
 	struct pci_dev *pdev = box->pci_dev;
 	int where = SNB_UNCORE_PCI_IMC_BAR_OFFSET;
 	resource_size_t addr;
@@ -441,7 +442,10 @@ static void snb_uncore_imc_init_box(struct intel_uncore_box *box)
 
 	addr &= ~(PAGE_SIZE - 1);
 
-	box->io_addr = ioremap(addr, SNB_UNCORE_PCI_IMC_MAP_SIZE);
+	box->io_addr = ioremap(addr, type->mmio_map_size);
+	if (!box->io_addr)
+		pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
+
 	box->hrtimer_duration = UNCORE_SNB_IMC_HRTIMER_INTERVAL;
 }
 
@@ -597,6 +601,7 @@ static struct intel_uncore_type snb_uncore_imc = {
 	.num_counters   = 2,
 	.num_boxes	= 1,
 	.num_freerunning_types	= SNB_PCI_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+	.mmio_map_size	= SNB_UNCORE_PCI_IMC_MAP_SIZE,
 	.freerunning	= snb_uncore_imc_freerunning,
 	.event_descs	= snb_uncore_imc_events,
 	.format_group	= &snb_uncore_imc_format_group,
@@ -1157,6 +1162,7 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
 {
 	struct pci_dev *pdev = tgl_uncore_get_mc_dev();
 	struct intel_uncore_pmu *pmu = box->pmu;
+	struct intel_uncore_type *type = pmu->type;
 	resource_size_t addr;
 	u32 mch_bar;
 
@@ -1179,7 +1185,9 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
 	addr |= ((resource_size_t)mch_bar << 32);
 #endif
 
-	box->io_addr = ioremap(addr, TGL_UNCORE_PCI_IMC_MAP_SIZE);
+	box->io_addr = ioremap(addr, type->mmio_map_size);
+	if (!box->io_addr)
+		pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
 }
 
 static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = {
@@ -1205,6 +1213,7 @@ static struct intel_uncore_type tgl_uncore_imc_free_running = {
 	.num_counters		= 3,
 	.num_boxes		= 2,
 	.num_freerunning_types	= TGL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+	.mmio_map_size		= TGL_UNCORE_PCI_IMC_MAP_SIZE,
 	.freerunning		= tgl_uncore_imc_freerunning,
 	.ops			= &tgl_uncore_imc_freerunning_ops,
 	.event_descs		= tgl_uncore_imc_events,
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 07652fa20ebb..bffb7554f4fb 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -4421,6 +4421,7 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box,
 				       unsigned int box_ctl, int mem_offset)
 {
 	struct pci_dev *pdev = snr_uncore_get_mc_dev(box->dieid);
+	struct intel_uncore_type *type = box->pmu->type;
 	resource_size_t addr;
 	u32 pci_dword;
 
@@ -4435,9 +4436,11 @@ static void __snr_uncore_mmio_init_box(struct intel_uncore_box *box,
 
 	addr += box_ctl;
 
-	box->io_addr = ioremap(addr, SNR_IMC_MMIO_SIZE);
-	if (!box->io_addr)
+	box->io_addr = ioremap(addr, type->mmio_map_size);
+	if (!box->io_addr) {
+		pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
 		return;
+	}
 
 	writel(IVBEP_PMON_BOX_CTL_INT, box->io_addr);
 }
@@ -4530,6 +4533,7 @@ static struct intel_uncore_type snr_uncore_imc = {
 	.event_mask	= SNBEP_PMON_RAW_EVENT_MASK,
 	.box_ctl	= SNR_IMC_MMIO_PMON_BOX_CTL,
 	.mmio_offset	= SNR_IMC_MMIO_OFFSET,
+	.mmio_map_size	= SNR_IMC_MMIO_SIZE,
 	.ops		= &snr_uncore_mmio_ops,
 	.format_group	= &skx_uncore_format_group,
 };
@@ -4570,6 +4574,7 @@ static struct intel_uncore_type snr_uncore_imc_free_running = {
 	.num_counters		= 3,
 	.num_boxes		= 1,
 	.num_freerunning_types	= SNR_IMC_FREERUNNING_TYPE_MAX,
+	.mmio_map_size		= SNR_IMC_MMIO_SIZE,
 	.freerunning		= snr_imc_freerunning,
 	.ops			= &snr_uncore_imc_freerunning_ops,
 	.event_descs		= snr_uncore_imc_freerunning_events,
@@ -4987,6 +4992,7 @@ static struct intel_uncore_type icx_uncore_imc = {
 	.event_mask	= SNBEP_PMON_RAW_EVENT_MASK,
 	.box_ctl	= SNR_IMC_MMIO_PMON_BOX_CTL,
 	.mmio_offset	= SNR_IMC_MMIO_OFFSET,
+	.mmio_map_size	= SNR_IMC_MMIO_SIZE,
 	.ops		= &icx_uncore_mmio_ops,
 	.format_group	= &skx_uncore_format_group,
 };
@@ -5044,6 +5050,7 @@ static struct intel_uncore_type icx_uncore_imc_free_running = {
 	.num_counters		= 5,
 	.num_boxes		= 4,
 	.num_freerunning_types	= ICX_IMC_FREERUNNING_TYPE_MAX,
+	.mmio_map_size		= SNR_IMC_MMIO_SIZE,
 	.freerunning		= icx_imc_freerunning,
 	.ops			= &icx_uncore_imc_freerunning_ops,
 	.event_descs		= icx_uncore_imc_freerunning_events,

From f01719730bbe04b90ae60c7e9d2b6d3533308502 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Thu, 28 May 2020 08:19:29 -0700
Subject: [PATCH 016/502] perf/x86/intel/uncore: Validate MMIO address before
 accessing

An oops will be triggered, if perf tries to access an invalid address
which exceeds the mapped area.

Check the address before the actual access to MMIO space of an uncore
unit.

Suggested-by: David Laight <David.Laight@ACULAB.COM>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1590679169-61823-3-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/uncore.c       |  3 +++
 arch/x86/events/intel/uncore.h       | 12 ++++++++++++
 arch/x86/events/intel/uncore_snbep.c |  6 ++++++
 3 files changed, 21 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index b9c28765bf33..cbe32d592aad 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -132,6 +132,9 @@ u64 uncore_mmio_read_counter(struct intel_uncore_box *box,
 	if (!box->io_addr)
 		return 0;
 
+	if (!uncore_mmio_is_valid_offset(box, event->hw.event_base))
+		return 0;
+
 	return readq(box->io_addr + event->hw.event_base);
 }
 
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 79ff626b7ea6..7859ac01f7a5 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -197,6 +197,18 @@ static inline bool uncore_pmc_freerunning(int idx)
 	return idx == UNCORE_PMC_IDX_FREERUNNING;
 }
 
+static inline bool uncore_mmio_is_valid_offset(struct intel_uncore_box *box,
+					       unsigned long offset)
+{
+	if (offset < box->pmu->type->mmio_map_size)
+		return true;
+
+	pr_warn_once("perf uncore: Invalid offset 0x%lx exceeds mapped area of %s.\n",
+		     offset, box->pmu->type->name);
+
+	return false;
+}
+
 static inline
 unsigned int uncore_mmio_box_ctl(struct intel_uncore_box *box)
 {
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index bffb7554f4fb..045c2d2231d2 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -4483,6 +4483,9 @@ static void snr_uncore_mmio_enable_event(struct intel_uncore_box *box,
 	if (!box->io_addr)
 		return;
 
+	if (!uncore_mmio_is_valid_offset(box, hwc->config_base))
+		return;
+
 	writel(hwc->config | SNBEP_PMON_CTL_EN,
 	       box->io_addr + hwc->config_base);
 }
@@ -4495,6 +4498,9 @@ static void snr_uncore_mmio_disable_event(struct intel_uncore_box *box,
 	if (!box->io_addr)
 		return;
 
+	if (!uncore_mmio_is_valid_offset(box, hwc->config_base))
+		return;
+
 	writel(hwc->config, box->io_addr + hwc->config_base);
 }
 

From 19a39819818dee57e363bd44bd096e2e940a456b Mon Sep 17 00:00:00 2001
From: Roman Sudarikov <roman.sudarikov@linux.intel.com>
Date: Mon, 1 Jun 2020 11:35:41 +0300
Subject: [PATCH 017/502] perf/x86/intel/uncore: Expose an Uncore unit to PMON
 mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each Uncore unit type, by its nature, can be mapped to its own context -
which platform component each PMON block of that type is supposed to
monitor.

Intel® Xeon® Scalable processor family (code name Skylake-SP) makes
significant changes in the integrated I/O (IIO) architecture. The new
solution introduces IIO stacks which are responsible for managing traffic
between the PCIe domain and the Mesh domain. Each IIO stack has its own
PMON block and can handle either DMI port, x16 PCIe root port, MCP-Link
or various built-in accelerators. IIO PMON blocks allow concurrent
monitoring of I/O flows up to 4 x4 bifurcation within each IIO stack.

Software is supposed to program required perf counters within each IIO
stack and gather performance data. The tricky thing here is that IIO PMON
reports data per IIO stack but users have no idea what IIO stacks are -
they only know devices which are connected to the platform.

Finding which IIO stack a particular IO device is connected to, or
identifying which IIO PMON block to program in order to monitor a specific
IIO stack, requires a lot of implicit knowledge about the given Intel
server platform architecture.

Usage example:
    ls /sys/devices/uncore_<type>_<pmu_idx>/die*

Signed-off-by: Alexander Antonov <alexander.antonov@linux.intel.com>
Signed-off-by: Roman Sudarikov <roman.sudarikov@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Link: https://lkml.kernel.org/r/20200601083543.30011-2-alexander.antonov@linux.intel.com
---
 arch/x86/events/intel/uncore.c |  8 ++++++++
 arch/x86/events/intel/uncore.h | 12 ++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index cbe32d592aad..49255e656e85 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -846,10 +846,12 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
 			.read		= uncore_pmu_event_read,
 			.module		= THIS_MODULE,
 			.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+			.attr_update	= pmu->type->attr_update,
 		};
 	} else {
 		pmu->pmu = *pmu->type->pmu;
 		pmu->pmu.attr_groups = pmu->type->attr_groups;
+		pmu->pmu.attr_update = pmu->type->attr_update;
 	}
 
 	if (pmu->type->num_boxes == 1) {
@@ -890,6 +892,9 @@ static void uncore_type_exit(struct intel_uncore_type *type)
 	struct intel_uncore_pmu *pmu = type->pmus;
 	int i;
 
+	if (type->cleanup_mapping)
+		type->cleanup_mapping(type);
+
 	if (pmu) {
 		for (i = 0; i < type->num_boxes; i++, pmu++) {
 			uncore_pmu_unregister(pmu);
@@ -957,6 +962,9 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 
 	type->pmu_group = &uncore_pmu_attr_group;
 
+	if (type->set_mapping)
+		type->set_mapping(type);
+
 	return 0;
 
 err:
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 7859ac01f7a5..7caba06c7df5 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -73,7 +73,19 @@ struct intel_uncore_type {
 	struct uncore_event_desc *event_descs;
 	struct freerunning_counters *freerunning;
 	const struct attribute_group *attr_groups[4];
+	const struct attribute_group **attr_update;
 	struct pmu *pmu; /* for custom pmu ops */
+	/*
+	 * Uncore PMU would store relevant platform topology configuration here
+	 * to identify which platform component each PMON block of that type is
+	 * supposed to monitor.
+	 */
+	u64 *topology;
+	/*
+	 * Optional callbacks for managing mapping of Uncore units to PMONs
+	 */
+	int (*set_mapping)(struct intel_uncore_type *type);
+	void (*cleanup_mapping)(struct intel_uncore_type *type);
 };
 
 #define pmu_group attr_groups[0]

From 36b533bc5e3ed1039406f3b27e746b4d18f2cac1 Mon Sep 17 00:00:00 2001
From: Roman Sudarikov <roman.sudarikov@linux.intel.com>
Date: Mon, 1 Jun 2020 11:35:42 +0300
Subject: [PATCH 018/502] perf/x86/intel/uncore: Wrap the max dies calculation
 into an accessor

Add an accessor that returns the number of dies on the platform.

Signed-off-by: Alexander Antonov <alexander.antonov@linux.intel.com>
Signed-off-by: Roman Sudarikov <roman.sudarikov@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Link: https://lkml.kernel.org/r/20200601083543.30011-3-alexander.antonov@linux.intel.com
---
 arch/x86/events/intel/uncore.c | 13 +++++++------
 arch/x86/events/intel/uncore.h |  3 +++
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 49255e656e85..d5c6d3b340c5 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -16,7 +16,7 @@ struct pci_driver *uncore_pci_driver;
 DEFINE_RAW_SPINLOCK(pci2phy_map_lock);
 struct list_head pci2phy_map_head = LIST_HEAD_INIT(pci2phy_map_head);
 struct pci_extra_dev *uncore_extra_pci_dev;
-static int max_dies;
+int __uncore_max_dies;
 
 /* mask of cpus that collect uncore events */
 static cpumask_t uncore_cpu_mask;
@@ -108,7 +108,7 @@ struct intel_uncore_box *uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu
 	 * The unsigned check also catches the '-1' return value for non
 	 * existent mappings in the topology map.
 	 */
-	return dieid < max_dies ? pmu->boxes[dieid] : NULL;
+	return dieid < uncore_max_dies() ? pmu->boxes[dieid] : NULL;
 }
 
 u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event)
@@ -882,7 +882,7 @@ static void uncore_free_boxes(struct intel_uncore_pmu *pmu)
 {
 	int die;
 
-	for (die = 0; die < max_dies; die++)
+	for (die = 0; die < uncore_max_dies(); die++)
 		kfree(pmu->boxes[die]);
 	kfree(pmu->boxes);
 }
@@ -923,7 +923,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type, bool setid)
 	if (!pmus)
 		return -ENOMEM;
 
-	size = max_dies * sizeof(struct intel_uncore_box *);
+	size = uncore_max_dies() * sizeof(struct intel_uncore_box *);
 
 	for (i = 0; i < type->num_boxes; i++) {
 		pmus[i].func_id	= setid ? i : -1;
@@ -1123,7 +1123,7 @@ static int __init uncore_pci_init(void)
 	size_t size;
 	int ret;
 
-	size = max_dies * sizeof(struct pci_extra_dev);
+	size = uncore_max_dies() * sizeof(struct pci_extra_dev);
 	uncore_extra_pci_dev = kzalloc(size, GFP_KERNEL);
 	if (!uncore_extra_pci_dev) {
 		ret = -ENOMEM;
@@ -1552,7 +1552,8 @@ static int __init intel_uncore_init(void)
 	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
 		return -ENODEV;
 
-	max_dies = topology_max_packages() * topology_max_die_per_package();
+	__uncore_max_dies =
+		topology_max_packages() * topology_max_die_per_package();
 
 	uncore_init = (struct intel_uncore_init_fun *)id->driver_data;
 	if (uncore_init->pci_init) {
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 7caba06c7df5..594a2fe20de9 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -182,6 +182,9 @@ int uncore_pcibus_to_physid(struct pci_bus *bus);
 ssize_t uncore_event_show(struct kobject *kobj,
 			  struct kobj_attribute *attr, char *buf);
 
+extern int __uncore_max_dies;
+#define uncore_max_dies()	(__uncore_max_dies)
+
 #define INTEL_UNCORE_EVENT_DESC(_name, _config)			\
 {								\
 	.attr	= __ATTR(_name, 0444, uncore_event_show, NULL),	\

From bb42b3d39781d7fcd3be7f9f9bf11b6661b5fdf1 Mon Sep 17 00:00:00 2001
From: Roman Sudarikov <roman.sudarikov@linux.intel.com>
Date: Mon, 1 Jun 2020 11:35:43 +0300
Subject: [PATCH 019/502] perf/x86/intel/uncore: Expose an Uncore unit to IIO
 PMON mapping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The current version supports the server line starting with the Intel® Xeon®
Processor Scalable Family and introduces mapping for IIO Uncore units only.
Other units can be added on demand.

IIO stack to PMON mapping is exposed through:
    /sys/devices/uncore_iio_<pmu_idx>/dieX
    where dieX is a file which holds "Segment:Root Bus" for PCIe root port,
    which can be monitored by that IIO PMON block.

Details are explained in Documentation/ABI/testing/sysfs-devices-mapping

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Alexander Antonov <alexander.antonov@linux.intel.com>
Signed-off-by: Roman Sudarikov <roman.sudarikov@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Kan Liang <kan.liang@linux.intel.com>
Reviewed-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Link: https://lkml.kernel.org/r/20200601083543.30011-4-alexander.antonov@linux.intel.com
---
 .../ABI/testing/sysfs-devices-mapping         |  33 +++
 arch/x86/events/intel/uncore.h                |   9 +
 arch/x86/events/intel/uncore_snbep.c          | 191 ++++++++++++++++++
 3 files changed, 233 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-devices-mapping

diff --git a/Documentation/ABI/testing/sysfs-devices-mapping b/Documentation/ABI/testing/sysfs-devices-mapping
new file mode 100644
index 000000000000..490ccfd67f12
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-mapping
@@ -0,0 +1,33 @@
+What:           /sys/devices/uncore_iio_x/dieX
+Date:           February 2020
+Contact:        Roman Sudarikov <roman.sudarikov@linux.intel.com>
+Description:
+                Each IIO stack (PCIe root port) has its own IIO PMON block, so
+                each dieX file (where X is die number) holds "Segment:Root Bus"
+                for PCIe root port, which can be monitored by that IIO PMON
+                block.
+                For example, on 4-die Xeon platform with up to 6 IIO stacks per
+                die and, therefore, 6 IIO PMON blocks per die, the mapping of
+                IIO PMON block 0 exposes as the following:
+
+                $ ls /sys/devices/uncore_iio_0/die*
+                -r--r--r-- /sys/devices/uncore_iio_0/die0
+                -r--r--r-- /sys/devices/uncore_iio_0/die1
+                -r--r--r-- /sys/devices/uncore_iio_0/die2
+                -r--r--r-- /sys/devices/uncore_iio_0/die3
+
+                $ tail /sys/devices/uncore_iio_0/die*
+                ==> /sys/devices/uncore_iio_0/die0 <==
+                0000:00
+                ==> /sys/devices/uncore_iio_0/die1 <==
+                0000:40
+                ==> /sys/devices/uncore_iio_0/die2 <==
+                0000:80
+                ==> /sys/devices/uncore_iio_0/die3 <==
+                0000:c0
+
+                Which means:
+                IIO PMU 0 on die 0 belongs to PCI RP on bus 0x00, domain 0x0000
+                IIO PMU 0 on die 1 belongs to PCI RP on bus 0x40, domain 0x0000
+                IIO PMU 0 on die 2 belongs to PCI RP on bus 0x80, domain 0x0000
+                IIO PMU 0 on die 3 belongs to PCI RP on bus 0xc0, domain 0x0000
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 594a2fe20de9..105fdc69825e 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -182,6 +182,15 @@ int uncore_pcibus_to_physid(struct pci_bus *bus);
 ssize_t uncore_event_show(struct kobject *kobj,
 			  struct kobj_attribute *attr, char *buf);
 
+static inline struct intel_uncore_pmu *dev_to_uncore_pmu(struct device *dev)
+{
+	return container_of(dev_get_drvdata(dev), struct intel_uncore_pmu, pmu);
+}
+
+#define to_device_attribute(n)	container_of(n, struct device_attribute, attr)
+#define to_dev_ext_attribute(n)	container_of(n, struct dev_ext_attribute, attr)
+#define attr_to_ext_attr(n)	to_dev_ext_attribute(to_device_attribute(n))
+
 extern int __uncore_max_dies;
 #define uncore_max_dies()	(__uncore_max_dies)
 
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 045c2d2231d2..62e88ad919ff 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -273,6 +273,30 @@
 #define SKX_CPUNODEID			0xc0
 #define SKX_GIDNIDMAP			0xd4
 
+/*
+ * The CPU_BUS_NUMBER MSR returns the values of the respective CPUBUSNO CSR
+ * that BIOS programmed. MSR has package scope.
+ * |  Bit  |  Default  |  Description
+ * | [63]  |    00h    | VALID - When set, indicates the CPU bus
+ *                       numbers have been initialized. (RO)
+ * |[62:48]|    ---    | Reserved
+ * |[47:40]|    00h    | BUS_NUM_5 — Return the bus number BIOS assigned
+ *                       CPUBUSNO(5). (RO)
+ * |[39:32]|    00h    | BUS_NUM_4 — Return the bus number BIOS assigned
+ *                       CPUBUSNO(4). (RO)
+ * |[31:24]|    00h    | BUS_NUM_3 — Return the bus number BIOS assigned
+ *                       CPUBUSNO(3). (RO)
+ * |[23:16]|    00h    | BUS_NUM_2 — Return the bus number BIOS assigned
+ *                       CPUBUSNO(2). (RO)
+ * |[15:8] |    00h    | BUS_NUM_1 — Return the bus number BIOS assigned
+ *                       CPUBUSNO(1). (RO)
+ * | [7:0] |    00h    | BUS_NUM_0 — Return the bus number BIOS assigned
+ *                       CPUBUSNO(0). (RO)
+ */
+#define SKX_MSR_CPU_BUS_NUMBER		0x300
+#define SKX_MSR_CPU_BUS_VALID_BIT	(1ULL << 63)
+#define BUS_NUM_STRIDE			8
+
 /* SKX CHA */
 #define SKX_CHA_MSR_PMON_BOX_FILTER_TID		(0x1ffULL << 0)
 #define SKX_CHA_MSR_PMON_BOX_FILTER_LINK	(0xfULL << 9)
@@ -3612,6 +3636,170 @@ static struct intel_uncore_ops skx_uncore_iio_ops = {
 	.read_counter		= uncore_msr_read_counter,
 };
 
+static inline u8 skx_iio_stack(struct intel_uncore_pmu *pmu, int die)
+{
+	return pmu->type->topology[die] >> (pmu->pmu_idx * BUS_NUM_STRIDE);
+}
+
+static umode_t
+skx_iio_mapping_visible(struct kobject *kobj, struct attribute *attr, int die)
+{
+	struct intel_uncore_pmu *pmu = dev_to_uncore_pmu(kobj_to_dev(kobj));
+
+	/* Root bus 0x00 is valid only for die 0 AND pmu_idx = 0. */
+	return (!skx_iio_stack(pmu, die) && pmu->pmu_idx) ? 0 : attr->mode;
+}
+
+static ssize_t skx_iio_mapping_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct pci_bus *bus = pci_find_next_bus(NULL);
+	struct intel_uncore_pmu *uncore_pmu = dev_to_uncore_pmu(dev);
+	struct dev_ext_attribute *ea = to_dev_ext_attribute(attr);
+	long die = (long)ea->var;
+
+	/*
+	 * Current implementation is for single segment configuration hence it's
+	 * safe to take the segment value from the first available root bus.
+	 */
+	return sprintf(buf, "%04x:%02x\n", pci_domain_nr(bus),
+					   skx_iio_stack(uncore_pmu, die));
+}
+
+static int skx_msr_cpu_bus_read(int cpu, u64 *topology)
+{
+	u64 msr_value;
+
+	if (rdmsrl_on_cpu(cpu, SKX_MSR_CPU_BUS_NUMBER, &msr_value) ||
+			!(msr_value & SKX_MSR_CPU_BUS_VALID_BIT))
+		return -ENXIO;
+
+	*topology = msr_value;
+
+	return 0;
+}
+
+static int die_to_cpu(int die)
+{
+	int res = 0, cpu, current_die;
+	/*
+	 * Using cpus_read_lock() to ensure cpu is not going down between
+	 * looking at cpu_online_mask.
+	 */
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		current_die = topology_logical_die_id(cpu);
+		if (current_die == die) {
+			res = cpu;
+			break;
+		}
+	}
+	cpus_read_unlock();
+	return res;
+}
+
+static int skx_iio_get_topology(struct intel_uncore_type *type)
+{
+	int i, ret;
+	struct pci_bus *bus = NULL;
+
+	/*
+	 * Verified single-segment environments only; disabled for multiple
+	 * segment topologies for now except VMD domains.
+	 * VMD domains start at 0x10000 to not clash with ACPI _SEG domains.
+	 */
+	while ((bus = pci_find_next_bus(bus))
+		&& (!pci_domain_nr(bus) || pci_domain_nr(bus) > 0xffff))
+		;
+	if (bus)
+		return -EPERM;
+
+	type->topology = kcalloc(uncore_max_dies(), sizeof(u64), GFP_KERNEL);
+	if (!type->topology)
+		return -ENOMEM;
+
+	for (i = 0; i < uncore_max_dies(); i++) {
+		ret = skx_msr_cpu_bus_read(die_to_cpu(i), &type->topology[i]);
+		if (ret) {
+			kfree(type->topology);
+			type->topology = NULL;
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+static struct attribute_group skx_iio_mapping_group = {
+	.is_visible	= skx_iio_mapping_visible,
+};
+
+static const struct attribute_group *skx_iio_attr_update[] = {
+	&skx_iio_mapping_group,
+	NULL,
+};
+
+static int skx_iio_set_mapping(struct intel_uncore_type *type)
+{
+	char buf[64];
+	int ret;
+	long die = -1;
+	struct attribute **attrs = NULL;
+	struct dev_ext_attribute *eas = NULL;
+
+	ret = skx_iio_get_topology(type);
+	if (ret)
+		return ret;
+
+	/* One more for NULL. */
+	attrs = kcalloc((uncore_max_dies() + 1), sizeof(*attrs), GFP_KERNEL);
+	if (!attrs)
+		goto err;
+
+	eas = kcalloc(uncore_max_dies(), sizeof(*eas), GFP_KERNEL);
+	if (!eas)
+		goto err;
+
+	for (die = 0; die < uncore_max_dies(); die++) {
+		sprintf(buf, "die%ld", die);
+		sysfs_attr_init(&eas[die].attr.attr);
+		eas[die].attr.attr.name = kstrdup(buf, GFP_KERNEL);
+		if (!eas[die].attr.attr.name)
+			goto err;
+		eas[die].attr.attr.mode = 0444;
+		eas[die].attr.show = skx_iio_mapping_show;
+		eas[die].attr.store = NULL;
+		eas[die].var = (void *)die;
+		attrs[die] = &eas[die].attr.attr;
+	}
+	skx_iio_mapping_group.attrs = attrs;
+
+	return 0;
+err:
+	for (; die >= 0; die--)
+		kfree(eas[die].attr.attr.name);
+	kfree(eas);
+	kfree(attrs);
+	kfree(type->topology);
+	type->attr_update = NULL;
+	return -ENOMEM;
+}
+
+static void skx_iio_cleanup_mapping(struct intel_uncore_type *type)
+{
+	struct attribute **attr = skx_iio_mapping_group.attrs;
+
+	if (!attr)
+		return;
+
+	for (; *attr; attr++)
+		kfree((*attr)->name);
+	kfree(attr_to_ext_attr(*skx_iio_mapping_group.attrs));
+	kfree(skx_iio_mapping_group.attrs);
+	skx_iio_mapping_group.attrs = NULL;
+	kfree(type->topology);
+}
+
 static struct intel_uncore_type skx_uncore_iio = {
 	.name			= "iio",
 	.num_counters		= 4,
@@ -3626,6 +3814,9 @@ static struct intel_uncore_type skx_uncore_iio = {
 	.constraints		= skx_uncore_iio_constraints,
 	.ops			= &skx_uncore_iio_ops,
 	.format_group		= &skx_uncore_iio_format_group,
+	.attr_update		= skx_iio_attr_update,
+	.set_mapping		= skx_iio_set_mapping,
+	.cleanup_mapping	= skx_iio_cleanup_mapping,
 };
 
 enum perf_uncore_iio_freerunning_type_id {

From c935cd62d3fe985d7f0ebea185d2759e8992e96f Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 17 Jun 2020 17:17:19 +1000
Subject: [PATCH 020/502] lockdep: Split header file into lockdep and
 lockdep_types

There is a header file inclusion loop between asm-generic/bug.h
and linux/kernel.h.  This causes potential compile failures depending
on which file is included first.  One way of breaking this loop
is to stop spinlock_types.h from including lockdep.h.  This patch
splits lockdep.h into two files for this purpose.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Reviewed-by: Andy Shevchenko <andy.shevchenko@gmail.com>
Acked-by: Petr Mladek <pmladek@suse.com>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Link: https://lkml.kernel.org/r/E1jlSJz-0003hE-8g@fornost.hmeau.com
---
 include/linux/lockdep.h        | 178 +-----------------------------
 include/linux/lockdep_types.h  | 196 +++++++++++++++++++++++++++++++++
 include/linux/spinlock.h       |   1 +
 include/linux/spinlock_types.h |   2 +-
 4 files changed, 200 insertions(+), 177 deletions(-)
 create mode 100644 include/linux/lockdep_types.h

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 8fce5c98a4b0..3b73cf84f77d 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -10,181 +10,20 @@
 #ifndef __LINUX_LOCKDEP_H
 #define __LINUX_LOCKDEP_H
 
+#include <linux/lockdep_types.h>
+
 struct task_struct;
-struct lockdep_map;
 
 /* for sysctl */
 extern int prove_locking;
 extern int lock_stat;
 
-#define MAX_LOCKDEP_SUBCLASSES		8UL
-
-#include <linux/types.h>
-
-enum lockdep_wait_type {
-	LD_WAIT_INV = 0,	/* not checked, catch all */
-
-	LD_WAIT_FREE,		/* wait free, rcu etc.. */
-	LD_WAIT_SPIN,		/* spin loops, raw_spinlock_t etc.. */
-
-#ifdef CONFIG_PROVE_RAW_LOCK_NESTING
-	LD_WAIT_CONFIG,		/* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */
-#else
-	LD_WAIT_CONFIG = LD_WAIT_SPIN,
-#endif
-	LD_WAIT_SLEEP,		/* sleeping locks, mutex_t etc.. */
-
-	LD_WAIT_MAX,		/* must be last */
-};
-
 #ifdef CONFIG_LOCKDEP
 
 #include <linux/linkage.h>
-#include <linux/list.h>
 #include <linux/debug_locks.h>
 #include <linux/stacktrace.h>
 
-/*
- * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
- * the total number of states... :-(
- */
-#define XXX_LOCK_USAGE_STATES		(1+2*4)
-
-/*
- * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
- * cached in the instance of lockdep_map
- *
- * Currently main class (subclass == 0) and signle depth subclass
- * are cached in lockdep_map. This optimization is mainly targeting
- * on rq->lock. double_rq_lock() acquires this highly competitive with
- * single depth.
- */
-#define NR_LOCKDEP_CACHING_CLASSES	2
-
-/*
- * A lockdep key is associated with each lock object. For static locks we use
- * the lock address itself as the key. Dynamically allocated lock objects can
- * have a statically or dynamically allocated key. Dynamically allocated lock
- * keys must be registered before being used and must be unregistered before
- * the key memory is freed.
- */
-struct lockdep_subclass_key {
-	char __one_byte;
-} __attribute__ ((__packed__));
-
-/* hash_entry is used to keep track of dynamically allocated keys. */
-struct lock_class_key {
-	union {
-		struct hlist_node		hash_entry;
-		struct lockdep_subclass_key	subkeys[MAX_LOCKDEP_SUBCLASSES];
-	};
-};
-
-extern struct lock_class_key __lockdep_no_validate__;
-
-struct lock_trace;
-
-#define LOCKSTAT_POINTS		4
-
-/*
- * The lock-class itself. The order of the structure members matters.
- * reinit_class() zeroes the key member and all subsequent members.
- */
-struct lock_class {
-	/*
-	 * class-hash:
-	 */
-	struct hlist_node		hash_entry;
-
-	/*
-	 * Entry in all_lock_classes when in use. Entry in free_lock_classes
-	 * when not in use. Instances that are being freed are on one of the
-	 * zapped_classes lists.
-	 */
-	struct list_head		lock_entry;
-
-	/*
-	 * These fields represent a directed graph of lock dependencies,
-	 * to every node we attach a list of "forward" and a list of
-	 * "backward" graph nodes.
-	 */
-	struct list_head		locks_after, locks_before;
-
-	const struct lockdep_subclass_key *key;
-	unsigned int			subclass;
-	unsigned int			dep_gen_id;
-
-	/*
-	 * IRQ/softirq usage tracking bits:
-	 */
-	unsigned long			usage_mask;
-	const struct lock_trace		*usage_traces[XXX_LOCK_USAGE_STATES];
-
-	/*
-	 * Generation counter, when doing certain classes of graph walking,
-	 * to ensure that we check one node only once:
-	 */
-	int				name_version;
-	const char			*name;
-
-	short				wait_type_inner;
-	short				wait_type_outer;
-
-#ifdef CONFIG_LOCK_STAT
-	unsigned long			contention_point[LOCKSTAT_POINTS];
-	unsigned long			contending_point[LOCKSTAT_POINTS];
-#endif
-} __no_randomize_layout;
-
-#ifdef CONFIG_LOCK_STAT
-struct lock_time {
-	s64				min;
-	s64				max;
-	s64				total;
-	unsigned long			nr;
-};
-
-enum bounce_type {
-	bounce_acquired_write,
-	bounce_acquired_read,
-	bounce_contended_write,
-	bounce_contended_read,
-	nr_bounce_types,
-
-	bounce_acquired = bounce_acquired_write,
-	bounce_contended = bounce_contended_write,
-};
-
-struct lock_class_stats {
-	unsigned long			contention_point[LOCKSTAT_POINTS];
-	unsigned long			contending_point[LOCKSTAT_POINTS];
-	struct lock_time		read_waittime;
-	struct lock_time		write_waittime;
-	struct lock_time		read_holdtime;
-	struct lock_time		write_holdtime;
-	unsigned long			bounces[nr_bounce_types];
-};
-
-struct lock_class_stats lock_stats(struct lock_class *class);
-void clear_lock_stats(struct lock_class *class);
-#endif
-
-/*
- * Map the lock object (the lock instance) to the lock-class object.
- * This is embedded into specific lock instances:
- */
-struct lockdep_map {
-	struct lock_class_key		*key;
-	struct lock_class		*class_cache[NR_LOCKDEP_CACHING_CLASSES];
-	const char			*name;
-	short				wait_type_outer; /* can be taken in this context */
-	short				wait_type_inner; /* presents this context */
-#ifdef CONFIG_LOCK_STAT
-	int				cpu;
-	unsigned long			ip;
-#endif
-};
-
 static inline void lockdep_copy_map(struct lockdep_map *to,
 				    struct lockdep_map *from)
 {
@@ -440,8 +279,6 @@ static inline void lock_set_subclass(struct lockdep_map *lock,
 
 extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip);
 
-struct pin_cookie { unsigned int val; };
-
 #define NIL_COOKIE (struct pin_cookie){ .val = 0U, }
 
 extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
@@ -520,10 +357,6 @@ static inline void lockdep_set_selftest_task(struct task_struct *task)
 # define lockdep_reset()		do { debug_locks = 1; } while (0)
 # define lockdep_free_key_range(start, size)	do { } while (0)
 # define lockdep_sys_exit() 			do { } while (0)
-/*
- * The class key takes no space if lockdep is disabled:
- */
-struct lock_class_key { };
 
 static inline void lockdep_register_key(struct lock_class_key *key)
 {
@@ -533,11 +366,6 @@ static inline void lockdep_unregister_key(struct lock_class_key *key)
 {
 }
 
-/*
- * The lockdep_map takes no space if lockdep is disabled:
- */
-struct lockdep_map { };
-
 #define lockdep_depth(tsk)	(0)
 
 #define lockdep_is_held_type(l, r)		(1)
@@ -549,8 +377,6 @@ struct lockdep_map { };
 
 #define lockdep_recursing(tsk)			(0)
 
-struct pin_cookie { };
-
 #define NIL_COOKIE (struct pin_cookie){ }
 
 #define lockdep_pin_lock(l)			({ struct pin_cookie cookie = { }; cookie; })
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
new file mode 100644
index 000000000000..7b9350624577
--- /dev/null
+++ b/include/linux/lockdep_types.h
@@ -0,0 +1,196 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Runtime locking correctness validator
+ *
+ *  Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
+ *
+ * see Documentation/locking/lockdep-design.rst for more details.
+ */
+#ifndef __LINUX_LOCKDEP_TYPES_H
+#define __LINUX_LOCKDEP_TYPES_H
+
+#include <linux/types.h>
+
+#define MAX_LOCKDEP_SUBCLASSES		8UL
+
+enum lockdep_wait_type {
+	LD_WAIT_INV = 0,	/* not checked, catch all */
+
+	LD_WAIT_FREE,		/* wait free, rcu etc.. */
+	LD_WAIT_SPIN,		/* spin loops, raw_spinlock_t etc.. */
+
+#ifdef CONFIG_PROVE_RAW_LOCK_NESTING
+	LD_WAIT_CONFIG,		/* CONFIG_PREEMPT_LOCK, spinlock_t etc.. */
+#else
+	LD_WAIT_CONFIG = LD_WAIT_SPIN,
+#endif
+	LD_WAIT_SLEEP,		/* sleeping locks, mutex_t etc.. */
+
+	LD_WAIT_MAX,		/* must be last */
+};
+
+#ifdef CONFIG_LOCKDEP
+
+#include <linux/list.h>
+
+/*
+ * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
+ * the total number of states... :-(
+ */
+#define XXX_LOCK_USAGE_STATES		(1+2*4)
+
+/*
+ * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
+ * cached in the instance of lockdep_map
+ *
+ * Currently main class (subclass == 0) and single depth subclass
+ * are cached in lockdep_map. This optimization is mainly targeting
+ * on rq->lock. double_rq_lock() acquires this highly competitive with
+ * single depth.
+ */
+#define NR_LOCKDEP_CACHING_CLASSES	2
+
+/*
+ * A lockdep key is associated with each lock object. For static locks we use
+ * the lock address itself as the key. Dynamically allocated lock objects can
+ * have a statically or dynamically allocated key. Dynamically allocated lock
+ * keys must be registered before being used and must be unregistered before
+ * the key memory is freed.
+ */
+struct lockdep_subclass_key {
+	char __one_byte;
+} __attribute__ ((__packed__));
+
+/* hash_entry is used to keep track of dynamically allocated keys. */
+struct lock_class_key {
+	union {
+		struct hlist_node		hash_entry;
+		struct lockdep_subclass_key	subkeys[MAX_LOCKDEP_SUBCLASSES];
+	};
+};
+
+extern struct lock_class_key __lockdep_no_validate__;
+
+struct lock_trace;
+
+#define LOCKSTAT_POINTS		4
+
+/*
+ * The lock-class itself. The order of the structure members matters.
+ * reinit_class() zeroes the key member and all subsequent members.
+ */
+struct lock_class {
+	/*
+	 * class-hash:
+	 */
+	struct hlist_node		hash_entry;
+
+	/*
+	 * Entry in all_lock_classes when in use. Entry in free_lock_classes
+	 * when not in use. Instances that are being freed are on one of the
+	 * zapped_classes lists.
+	 */
+	struct list_head		lock_entry;
+
+	/*
+	 * These fields represent a directed graph of lock dependencies,
+	 * to every node we attach a list of "forward" and a list of
+	 * "backward" graph nodes.
+	 */
+	struct list_head		locks_after, locks_before;
+
+	const struct lockdep_subclass_key *key;
+	unsigned int			subclass;
+	unsigned int			dep_gen_id;
+
+	/*
+	 * IRQ/softirq usage tracking bits:
+	 */
+	unsigned long			usage_mask;
+	const struct lock_trace		*usage_traces[XXX_LOCK_USAGE_STATES];
+
+	/*
+	 * Generation counter, when doing certain classes of graph walking,
+	 * to ensure that we check one node only once:
+	 */
+	int				name_version;
+	const char			*name;
+
+	short				wait_type_inner;
+	short				wait_type_outer;
+
+#ifdef CONFIG_LOCK_STAT
+	unsigned long			contention_point[LOCKSTAT_POINTS];
+	unsigned long			contending_point[LOCKSTAT_POINTS];
+#endif
+} __no_randomize_layout;
+
+#ifdef CONFIG_LOCK_STAT
+struct lock_time {
+	s64				min;
+	s64				max;
+	s64				total;
+	unsigned long			nr;
+};
+
+enum bounce_type {
+	bounce_acquired_write,
+	bounce_acquired_read,
+	bounce_contended_write,
+	bounce_contended_read,
+	nr_bounce_types,
+
+	bounce_acquired = bounce_acquired_write,
+	bounce_contended = bounce_contended_write,
+};
+
+struct lock_class_stats {
+	unsigned long			contention_point[LOCKSTAT_POINTS];
+	unsigned long			contending_point[LOCKSTAT_POINTS];
+	struct lock_time		read_waittime;
+	struct lock_time		write_waittime;
+	struct lock_time		read_holdtime;
+	struct lock_time		write_holdtime;
+	unsigned long			bounces[nr_bounce_types];
+};
+
+struct lock_class_stats lock_stats(struct lock_class *class);
+void clear_lock_stats(struct lock_class *class);
+#endif
+
+/*
+ * Map the lock object (the lock instance) to the lock-class object.
+ * This is embedded into specific lock instances:
+ */
+struct lockdep_map {
+	struct lock_class_key		*key;
+	struct lock_class		*class_cache[NR_LOCKDEP_CACHING_CLASSES];
+	const char			*name;
+	short				wait_type_outer; /* can be taken in this context */
+	short				wait_type_inner; /* presents this context */
+#ifdef CONFIG_LOCK_STAT
+	int				cpu;
+	unsigned long			ip;
+#endif
+};
+
+struct pin_cookie { unsigned int val; };
+
+#else /* !CONFIG_LOCKDEP */
+
+/*
+ * The class key takes no space if lockdep is disabled:
+ */
+struct lock_class_key { };
+
+/*
+ * The lockdep_map takes no space if lockdep is disabled:
+ */
+struct lockdep_map { };
+
+struct pin_cookie { };
+
+#endif /* !CONFIG_LOCKDEP */
+
+#endif /* __LINUX_LOCKDEP_TYPES_H */
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index d3770b3f9d9a..f2f12d746dbd 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -56,6 +56,7 @@
 #include <linux/kernel.h>
 #include <linux/stringify.h>
 #include <linux/bottom_half.h>
+#include <linux/lockdep.h>
 #include <asm/barrier.h>
 #include <asm/mmiowb.h>
 
diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h
index 6102e6bff3ae..b981caafe8bf 100644
--- a/include/linux/spinlock_types.h
+++ b/include/linux/spinlock_types.h
@@ -15,7 +15,7 @@
 # include <linux/spinlock_types_up.h>
 #endif
 
-#include <linux/lockdep.h>
+#include <linux/lockdep_types.h>
 
 typedef struct raw_spinlock {
 	arch_spinlock_t raw_lock;

From 5769a351b89cd4d82016f18fa5f6c4077403564d Mon Sep 17 00:00:00 2001
From: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Date: Wed, 17 Jun 2020 17:53:55 +0800
Subject: [PATCH 021/502] io_uring: change the poll type to be 32-bits

poll events should be 32-bits to cover EPOLLEXCLUSIVE.

Explicitly word-swap the poll32_events for big endian to make sure the ABI
is not changed.  We call this feature IORING_FEAT_POLL_32BITS;
applications that want to use EPOLLEXCLUSIVE should check the feature bit
first.

Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c                 | 13 +++++++++----
 include/uapi/linux/io_uring.h |  4 +++-
 tools/io_uring/liburing.h     |  6 +++++-
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a78201b96179..0eb063daa9b5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4589,7 +4589,7 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_poll_iocb *poll = &req->poll;
-	u16 events;
+	u32 events;
 
 	if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
@@ -4598,7 +4598,10 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 	if (!poll->file)
 		return -EBADF;
 
-	events = READ_ONCE(sqe->poll_events);
+	events = READ_ONCE(sqe->poll32_events);
+#ifdef __BIG_ENDIAN
+	events = swahw32(events);
+#endif
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
 
 	io_get_req_task(req);
@@ -7928,7 +7931,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 
 	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
 			IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS |
-			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL;
+			IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL |
+			IORING_FEAT_POLL_32BITS;
 
 	if (copy_to_user(params, p, sizeof(*p))) {
 		ret = -EFAULT;
@@ -8217,7 +8221,8 @@ static int __init io_uring_init(void)
 	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
 	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
-	BUILD_BUG_SQE_ELEM(28, __u16,  poll_events);
+	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
+	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
 	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
 	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 92c22699a5a7..8d033961cb78 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -31,7 +31,8 @@ struct io_uring_sqe {
 	union {
 		__kernel_rwf_t	rw_flags;
 		__u32		fsync_flags;
-		__u16		poll_events;
+		__u16		poll_events;	/* compatibility */
+		__u32		poll32_events;	/* word-reversed for BE */
 		__u32		sync_range_flags;
 		__u32		msg_flags;
 		__u32		timeout_flags;
@@ -248,6 +249,7 @@ struct io_uring_params {
 #define IORING_FEAT_RW_CUR_POS		(1U << 3)
 #define IORING_FEAT_CUR_PERSONALITY	(1U << 4)
 #define IORING_FEAT_FAST_POLL		(1U << 5)
+#define IORING_FEAT_POLL_32BITS 	(1U << 6)
 
 /*
  * io_uring_register(2) opcodes and arguments
diff --git a/tools/io_uring/liburing.h b/tools/io_uring/liburing.h
index 5f305c86b892..28a837b6069d 100644
--- a/tools/io_uring/liburing.h
+++ b/tools/io_uring/liburing.h
@@ -10,6 +10,7 @@ extern "C" {
 #include <string.h>
 #include "../../include/uapi/linux/io_uring.h"
 #include <inttypes.h>
+#include <linux/swab.h>
 #include "barrier.h"
 
 /*
@@ -145,11 +146,14 @@ static inline void io_uring_prep_write_fixed(struct io_uring_sqe *sqe, int fd,
 }
 
 static inline void io_uring_prep_poll_add(struct io_uring_sqe *sqe, int fd,
-					  short poll_mask)
+					  unsigned poll_mask)
 {
 	memset(sqe, 0, sizeof(*sqe));
 	sqe->opcode = IORING_OP_POLL_ADD;
 	sqe->fd = fd;
+#if __BYTE_ORDER == __BIG_ENDIAN
+	poll_mask = __swahw32(poll_mask);
+#endif
 	sqe->poll_events = poll_mask;
 }
 

From a31eb4a2f1650fa578082ad9e9845487ecd90abe Mon Sep 17 00:00:00 2001
From: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Date: Wed, 17 Jun 2020 17:53:56 +0800
Subject: [PATCH 022/502] io_uring: use EPOLLEXCLUSIVE flag to avoid thundering
 herd type behavior

Applications can pass this flag in to avoid accept thundering herd.

Signed-off-by: Jiufei Xue <jiufei.xue@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0eb063daa9b5..311e8038ae58 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4245,7 +4245,11 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt,
 
 	pt->error = 0;
 	poll->head = head;
-	add_wait_queue(head, &poll->wait);
+
+	if (poll->events & EPOLLEXCLUSIVE)
+		add_wait_queue_exclusive(head, &poll->wait);
+	else
+		add_wait_queue(head, &poll->wait);
 }
 
 static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
@@ -4602,7 +4606,8 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe
 #ifdef __BIG_ENDIAN
 	events = swahw32(events);
 #endif
-	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
+	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP |
+		       (events & EPOLLEXCLUSIVE);
 
 	io_get_req_task(req);
 	return 0;

From a087e2b519929152fdde8299457e32d5a8994a7c Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Tue, 16 Jun 2020 16:36:07 -0700
Subject: [PATCH 023/502] io_uring: add wrappers for memory accounting

Facilitate separation of locked memory usage reporting vs. limiting for
upcoming patches.  No functional changes.

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
[axboe: kill unnecessary () around return in io_account_mem()]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 48 ++++++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 311e8038ae58..9db9f09499d1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6968,12 +6968,14 @@ err:
 	return ret;
 }
 
-static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages)
+static inline void __io_unaccount_mem(struct user_struct *user,
+				      unsigned long nr_pages)
 {
 	atomic_long_sub(nr_pages, &user->locked_vm);
 }
 
-static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
+static inline int __io_account_mem(struct user_struct *user,
+				   unsigned long nr_pages)
 {
 	unsigned long page_limit, cur_pages, new_pages;
 
@@ -6991,6 +6993,20 @@ static int io_account_mem(struct user_struct *user, unsigned long nr_pages)
 	return 0;
 }
 
+static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+{
+	if (ctx->account_mem)
+		__io_unaccount_mem(ctx->user, nr_pages);
+}
+
+static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+{
+	if (ctx->account_mem)
+		return __io_account_mem(ctx->user, nr_pages);
+
+	return 0;
+}
+
 static void io_mem_free(void *ptr)
 {
 	struct page *page;
@@ -7065,8 +7081,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
 		for (j = 0; j < imu->nr_bvecs; j++)
 			unpin_user_page(imu->bvec[j].bv_page);
 
-		if (ctx->account_mem)
-			io_unaccount_mem(ctx->user, imu->nr_bvecs);
+		io_unaccount_mem(ctx, imu->nr_bvecs);
 		kvfree(imu->bvec);
 		imu->nr_bvecs = 0;
 	}
@@ -7149,11 +7164,9 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 		start = ubuf >> PAGE_SHIFT;
 		nr_pages = end - start;
 
-		if (ctx->account_mem) {
-			ret = io_account_mem(ctx->user, nr_pages);
-			if (ret)
-				goto err;
-		}
+		ret = io_account_mem(ctx, nr_pages);
+		if (ret)
+			goto err;
 
 		ret = 0;
 		if (!pages || nr_pages > got_pages) {
@@ -7166,8 +7179,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 					GFP_KERNEL);
 			if (!pages || !vmas) {
 				ret = -ENOMEM;
-				if (ctx->account_mem)
-					io_unaccount_mem(ctx->user, nr_pages);
+				io_unaccount_mem(ctx, nr_pages);
 				goto err;
 			}
 			got_pages = nr_pages;
@@ -7177,8 +7189,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 						GFP_KERNEL);
 		ret = -ENOMEM;
 		if (!imu->bvec) {
-			if (ctx->account_mem)
-				io_unaccount_mem(ctx->user, nr_pages);
+			io_unaccount_mem(ctx, nr_pages);
 			goto err;
 		}
 
@@ -7209,8 +7220,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 			 */
 			if (pret > 0)
 				unpin_user_pages(pages, pret);
-			if (ctx->account_mem)
-				io_unaccount_mem(ctx->user, nr_pages);
+			io_unaccount_mem(ctx, nr_pages);
 			kvfree(imu->bvec);
 			goto err;
 		}
@@ -7315,9 +7325,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_mem_free(ctx->sq_sqes);
 
 	percpu_ref_exit(&ctx->refs);
-	if (ctx->account_mem)
-		io_unaccount_mem(ctx->user,
-				ring_pages(ctx->sq_entries, ctx->cq_entries));
+	io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries));
 	free_uid(ctx->user);
 	put_cred(ctx->creds);
 	kfree(ctx->cancel_hash);
@@ -7887,7 +7895,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	account_mem = !capable(CAP_IPC_LOCK);
 
 	if (account_mem) {
-		ret = io_account_mem(user,
+		ret = __io_account_mem(user,
 				ring_pages(p->sq_entries, p->cq_entries));
 		if (ret) {
 			free_uid(user);
@@ -7898,7 +7906,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	ctx = io_ring_ctx_alloc(p);
 	if (!ctx) {
 		if (account_mem)
-			io_unaccount_mem(user, ring_pages(p->sq_entries,
+			__io_unaccount_mem(user, ring_pages(p->sq_entries,
 								p->cq_entries));
 		free_uid(user);
 		return -ENOMEM;

From aad5d8da1b301fe399d65f2dcb84df2ec60caaa3 Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Tue, 16 Jun 2020 16:36:08 -0700
Subject: [PATCH 024/502] io_uring: rename ctx->account_mem field

Rename account_mem to limit_mem to clarify its purpose.

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9db9f09499d1..fcaf9eee3420 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -226,7 +226,7 @@ struct io_ring_ctx {
 	struct {
 		unsigned int		flags;
 		unsigned int		compat: 1;
-		unsigned int		account_mem: 1;
+		unsigned int		limit_mem: 1;
 		unsigned int		cq_overflow_flushed: 1;
 		unsigned int		drain_next: 1;
 		unsigned int		eventfd_async: 1;
@@ -6995,13 +6995,13 @@ static inline int __io_account_mem(struct user_struct *user,
 
 static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 {
-	if (ctx->account_mem)
+	if (ctx->limit_mem)
 		__io_unaccount_mem(ctx->user, nr_pages);
 }
 
 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 {
-	if (ctx->account_mem)
+	if (ctx->limit_mem)
 		return __io_account_mem(ctx->user, nr_pages);
 
 	return 0;
@@ -7853,7 +7853,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 {
 	struct user_struct *user = NULL;
 	struct io_ring_ctx *ctx;
-	bool account_mem;
+	bool limit_mem;
 	int ret;
 
 	if (!entries)
@@ -7892,9 +7892,9 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 	}
 
 	user = get_uid(current_user());
-	account_mem = !capable(CAP_IPC_LOCK);
+	limit_mem = !capable(CAP_IPC_LOCK);
 
-	if (account_mem) {
+	if (limit_mem) {
 		ret = __io_account_mem(user,
 				ring_pages(p->sq_entries, p->cq_entries));
 		if (ret) {
@@ -7905,14 +7905,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 
 	ctx = io_ring_ctx_alloc(p);
 	if (!ctx) {
-		if (account_mem)
+		if (limit_mem)
 			__io_unaccount_mem(user, ring_pages(p->sq_entries,
 								p->cq_entries));
 		free_uid(user);
 		return -ENOMEM;
 	}
 	ctx->compat = in_compat_syscall();
-	ctx->account_mem = account_mem;
+	ctx->limit_mem = limit_mem;
 	ctx->user = user;
 	ctx->creds = get_current_cred();
 

From 309758254ea62e07471abcaeca5b5c2173f4ebc2 Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Tue, 16 Jun 2020 16:36:09 -0700
Subject: [PATCH 025/502] io_uring: report pinned memory usage

Report pinned memory usage always, regardless of whether locked memory
limit is enforced.

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fcaf9eee3420..5ea55de3edef 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6997,12 +6997,23 @@ static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 {
 	if (ctx->limit_mem)
 		__io_unaccount_mem(ctx->user, nr_pages);
+
+	if (ctx->sqo_mm)
+		atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm);
 }
 
 static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 {
-	if (ctx->limit_mem)
-		return __io_account_mem(ctx->user, nr_pages);
+	int ret;
+
+	if (ctx->limit_mem) {
+		ret = __io_account_mem(ctx->user, nr_pages);
+		if (ret)
+			return ret;
+	}
+
+	if (ctx->sqo_mm)
+		atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm);
 
 	return 0;
 }
@@ -7304,8 +7315,10 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_finish_async(ctx);
-	if (ctx->sqo_mm)
+	if (ctx->sqo_mm) {
 		mmdrop(ctx->sqo_mm);
+		ctx->sqo_mm = NULL;
+	}
 
 	io_iopoll_reap_events(ctx);
 	io_sqe_buffer_unregister(ctx);
@@ -7912,7 +7925,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 		return -ENOMEM;
 	}
 	ctx->compat = in_compat_syscall();
-	ctx->limit_mem = limit_mem;
 	ctx->user = user;
 	ctx->creds = get_current_cred();
 
@@ -7960,6 +7972,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 		goto err;
 
 	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
+	io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries));
+	ctx->limit_mem = limit_mem;
 	return ret;
 err:
 	io_ring_ctx_wait_and_kill(ctx);

From 2e0464d48f32a9e78e2aa85cbbedc77ecbb6ed60 Mon Sep 17 00:00:00 2001
From: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Date: Tue, 16 Jun 2020 16:36:10 -0700
Subject: [PATCH 026/502] io_uring: separate reporting of ring pages from
 registered pages

Ring pages are not pinned so it is more appropriate to report them
as locked.

Signed-off-by: Bijan Mottahedeh <bijan.mottahedeh@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 43 ++++++++++++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5ea55de3edef..10b293780703 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -880,6 +880,11 @@ static const struct io_op_def io_op_defs[] = {
 	},
 };
 
+enum io_mem_account {
+	ACCT_LOCKED,
+	ACCT_PINNED,
+};
+
 static void io_wq_submit_work(struct io_wq_work **workptr);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
@@ -6993,16 +6998,22 @@ static inline int __io_account_mem(struct user_struct *user,
 	return 0;
 }
 
-static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
+			     enum io_mem_account acct)
 {
 	if (ctx->limit_mem)
 		__io_unaccount_mem(ctx->user, nr_pages);
 
-	if (ctx->sqo_mm)
-		atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm);
+	if (ctx->sqo_mm) {
+		if (acct == ACCT_LOCKED)
+			ctx->sqo_mm->locked_vm -= nr_pages;
+		else if (acct == ACCT_PINNED)
+			atomic64_sub(nr_pages, &ctx->sqo_mm->pinned_vm);
+	}
 }
 
-static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
+static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages,
+			  enum io_mem_account acct)
 {
 	int ret;
 
@@ -7012,8 +7023,12 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages)
 			return ret;
 	}
 
-	if (ctx->sqo_mm)
-		atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm);
+	if (ctx->sqo_mm) {
+		if (acct == ACCT_LOCKED)
+			ctx->sqo_mm->locked_vm += nr_pages;
+		else if (acct == ACCT_PINNED)
+			atomic64_add(nr_pages, &ctx->sqo_mm->pinned_vm);
+	}
 
 	return 0;
 }
@@ -7092,7 +7107,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx)
 		for (j = 0; j < imu->nr_bvecs; j++)
 			unpin_user_page(imu->bvec[j].bv_page);
 
-		io_unaccount_mem(ctx, imu->nr_bvecs);
+		io_unaccount_mem(ctx, imu->nr_bvecs, ACCT_PINNED);
 		kvfree(imu->bvec);
 		imu->nr_bvecs = 0;
 	}
@@ -7175,7 +7190,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 		start = ubuf >> PAGE_SHIFT;
 		nr_pages = end - start;
 
-		ret = io_account_mem(ctx, nr_pages);
+		ret = io_account_mem(ctx, nr_pages, ACCT_PINNED);
 		if (ret)
 			goto err;
 
@@ -7190,7 +7205,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 					GFP_KERNEL);
 			if (!pages || !vmas) {
 				ret = -ENOMEM;
-				io_unaccount_mem(ctx, nr_pages);
+				io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
 				goto err;
 			}
 			got_pages = nr_pages;
@@ -7200,7 +7215,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 						GFP_KERNEL);
 		ret = -ENOMEM;
 		if (!imu->bvec) {
-			io_unaccount_mem(ctx, nr_pages);
+			io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
 			goto err;
 		}
 
@@ -7231,7 +7246,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
 			 */
 			if (pret > 0)
 				unpin_user_pages(pages, pret);
-			io_unaccount_mem(ctx, nr_pages);
+			io_unaccount_mem(ctx, nr_pages, ACCT_PINNED);
 			kvfree(imu->bvec);
 			goto err;
 		}
@@ -7338,7 +7353,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	io_mem_free(ctx->sq_sqes);
 
 	percpu_ref_exit(&ctx->refs);
-	io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries));
+	io_unaccount_mem(ctx, ring_pages(ctx->sq_entries, ctx->cq_entries),
+			 ACCT_LOCKED);
 	free_uid(ctx->user);
 	put_cred(ctx->creds);
 	kfree(ctx->cancel_hash);
@@ -7972,7 +7988,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 		goto err;
 
 	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
-	io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries));
+	io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
+		       ACCT_LOCKED);
 	ctx->limit_mem = limit_mem;
 	return ret;
 err:

From 5a473e8311b582a40c10409a0f4bb39f42aa8123 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 4 Jun 2020 11:23:39 -0600
Subject: [PATCH 027/502] block: provide plug based way of signaling forced
 no-wait semantics

Provide a way for the caller to specify that IO should be marked
with REQ_NOWAIT to avoid blocking on allocation.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c       | 6 ++++++
 include/linux/blkdev.h | 1 +
 2 files changed, 7 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 03252af8c82c..62a4904db921 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -958,6 +958,7 @@ generic_make_request_checks(struct bio *bio)
 	struct request_queue *q;
 	int nr_sectors = bio_sectors(bio);
 	blk_status_t status = BLK_STS_IOERR;
+	struct blk_plug *plug;
 	char b[BDEVNAME_SIZE];
 
 	might_sleep();
@@ -971,6 +972,10 @@ generic_make_request_checks(struct bio *bio)
 		goto end_io;
 	}
 
+	plug = blk_mq_plug(q, bio);
+	if (plug && plug->nowait)
+		bio->bi_opf |= REQ_NOWAIT;
+
 	/*
 	 * For a REQ_NOWAIT based request, return -EOPNOTSUPP
 	 * if queue is not a request based queue.
@@ -1800,6 +1805,7 @@ void blk_start_plug(struct blk_plug *plug)
 	INIT_LIST_HEAD(&plug->cb_list);
 	plug->rq_count = 0;
 	plug->multiple_queues = false;
+	plug->nowait = false;
 
 	/*
 	 * Store ordering should not be needed here, since a potential
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8fd900998b4e..6e067dca94cf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1189,6 +1189,7 @@ struct blk_plug {
 	struct list_head cb_list; /* md requires an unplug callback */
 	unsigned short rq_count;
 	bool multiple_queues;
+	bool nowait;
 };
 #define BLK_MAX_REQUEST_COUNT 16
 #define BLK_PLUG_FLUSH_SIZE (128 * 1024)

From ac8691c415e0ce0b8734cb6d9df2df18608eebed Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 1 Jun 2020 08:30:41 -0600
Subject: [PATCH 028/502] io_uring: always plug for any number of IOs

Currently we only plug if we're doing more than two requests. We're going
to be relying on always having the plug there to pass down information,
so plug unconditionally.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 10b293780703..de894455f6bd 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -676,7 +676,6 @@ struct io_kiocb {
 	};
 };
 
-#define IO_PLUG_THRESHOLD		2
 #define IO_IOPOLL_BATCH			8
 
 struct io_submit_state {
@@ -5914,7 +5913,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			  struct file *ring_file, int ring_fd)
 {
-	struct io_submit_state state, *statep = NULL;
+	struct io_submit_state state;
 	struct io_kiocb *link = NULL;
 	int i, submitted = 0;
 
@@ -5931,10 +5930,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	if (!percpu_ref_tryget_many(&ctx->refs, nr))
 		return -EAGAIN;
 
-	if (nr > IO_PLUG_THRESHOLD) {
-		io_submit_state_start(&state, nr);
-		statep = &state;
-	}
+	io_submit_state_start(&state, nr);
 
 	ctx->ring_fd = ring_fd;
 	ctx->ring_file = ring_file;
@@ -5949,14 +5945,14 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			io_consume_sqe(ctx);
 			break;
 		}
-		req = io_alloc_req(ctx, statep);
+		req = io_alloc_req(ctx, &state);
 		if (unlikely(!req)) {
 			if (!submitted)
 				submitted = -EAGAIN;
 			break;
 		}
 
-		err = io_init_req(ctx, req, sqe, statep);
+		err = io_init_req(ctx, req, sqe, &state);
 		io_consume_sqe(ctx);
 		/* will complete beyond this point, count as submitted */
 		submitted++;
@@ -5982,8 +5978,7 @@ fail_req:
 	}
 	if (link)
 		io_queue_link_head(link);
-	if (statep)
-		io_submit_state_end(&state);
+	io_submit_state_end(&state);
 
 	 /* Commit SQ ring head once we've consumed and submitted all SQEs */
 	io_commit_sqring(ctx);

From 4503b7676a2e0abe69c2f2c0d8b03aec53f2f048 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 1 Jun 2020 10:00:27 -0600
Subject: [PATCH 029/502] io_uring: catch -EIO from buffered issue request
 failure

-EIO bubbles up like -EAGAIN if we fail to allocate a request at the
lower level. Play it safe and treat it like -EAGAIN in terms of sync
retry, to avoid passing back an errant -EIO.

Catch some of these early for block based files, as non-mq devices
generally do not support NOWAIT. That saves us some overhead by
not first trying, then retrying from async context. We can go straight
to async punt instead.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index de894455f6bd..c5ee6d1a92d3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2088,6 +2088,15 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
 	return state->file;
 }
 
+static bool io_bdev_nowait(struct block_device *bdev)
+{
+#ifdef CONFIG_BLOCK
+	return !bdev || queue_is_mq(bdev_get_queue(bdev));
+#else
+	return true;
+#endif
+}
+
 /*
  * If we tracked the file through the SCM inflight mechanism, we could support
  * any file. For now, just ensure that anything potentially problematic is done
@@ -2097,10 +2106,19 @@ static bool io_file_supports_async(struct file *file, int rw)
 {
 	umode_t mode = file_inode(file)->i_mode;
 
-	if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISSOCK(mode))
-		return true;
-	if (S_ISREG(mode) && file->f_op != &io_uring_fops)
+	if (S_ISBLK(mode)) {
+		if (io_bdev_nowait(file->f_inode->i_bdev))
+			return true;
+		return false;
+	}
+	if (S_ISCHR(mode) || S_ISSOCK(mode))
 		return true;
+	if (S_ISREG(mode)) {
+		if (io_bdev_nowait(file->f_inode->i_sb->s_bdev) &&
+		    file->f_op != &io_uring_fops)
+			return true;
+		return false;
+	}
 
 	/* any ->read/write should understand O_NONBLOCK */
 	if (file->f_flags & O_NONBLOCK)
@@ -2650,7 +2668,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
-		ssize_t ret2;
+		ssize_t ret2 = 0;
 
 		if (req->file->f_op->read_iter)
 			ret2 = call_read_iter(req->file, kiocb, &iter);
@@ -2658,7 +2676,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 			ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
 
 		/* Catch -EAGAIN return for forced non-blocking submission */
-		if (!force_nonblock || ret2 != -EAGAIN) {
+		if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
 			kiocb_done(kiocb, ret2);
 		} else {
 copy_iov:

From b63534c41e20b474483b4ddf47efc858c17352e0 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 4 Jun 2020 11:28:00 -0600
Subject: [PATCH 030/502] io_uring: re-issue block requests that failed because
 of resources

Mark the plug with nowait == true, which will cause requests to avoid
blocking on request allocation. If they do, we catch them and reissue
them from a task_work based handler.

Normally we can catch -EAGAIN directly, but the hard case is for split
requests. As an example, the application issues a 512KB request. The
block core will split this into 128KB if that's the max size for the
device. The first request issues just fine, but we run into -EAGAIN for
some later splits for the same request. As the bio is split, we don't
get to see the -EAGAIN until one of the actual reads complete, and hence
we cannot handle it inline as part of submission.

This does potentially cause re-reads of parts of the range, as the whole
request is reissued. There's currently no better way to handle this.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 148 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 124 insertions(+), 24 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c5ee6d1a92d3..f3dbf83fabf3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -900,6 +900,13 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 static void __io_queue_sqe(struct io_kiocb *req,
 			   const struct io_uring_sqe *sqe);
 
+static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
+			       struct iovec **iovec, struct iov_iter *iter,
+			       bool needs_lock);
+static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
+			     struct iovec *iovec, struct iovec *fast_iov,
+			     struct iov_iter *iter);
+
 static struct kmem_cache *req_cachep;
 
 static const struct file_operations io_uring_fops;
@@ -1978,12 +1985,115 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
 	__io_cqring_add_event(req, res, cflags);
 }
 
+static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (mm) {
+		kthread_unuse_mm(mm);
+		mmput(mm);
+	}
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+				   struct io_kiocb *req)
+{
+	if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+			return -EFAULT;
+		kthread_use_mm(ctx->sqo_mm);
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_BLOCK
+static bool io_resubmit_prep(struct io_kiocb *req, int error)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	ssize_t ret = -ECANCELED;
+	struct iov_iter iter;
+	int rw;
+
+	if (error) {
+		ret = error;
+		goto end_req;
+	}
+
+	switch (req->opcode) {
+	case IORING_OP_READV:
+	case IORING_OP_READ_FIXED:
+	case IORING_OP_READ:
+		rw = READ;
+		break;
+	case IORING_OP_WRITEV:
+	case IORING_OP_WRITE_FIXED:
+	case IORING_OP_WRITE:
+		rw = WRITE;
+		break;
+	default:
+		printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
+				req->opcode);
+		goto end_req;
+	}
+
+	ret = io_import_iovec(rw, req, &iovec, &iter, false);
+	if (ret < 0)
+		goto end_req;
+	ret = io_setup_async_rw(req, ret, iovec, inline_vecs, &iter);
+	if (!ret)
+		return true;
+	kfree(iovec);
+end_req:
+	io_cqring_add_event(req, ret);
+	req_set_fail_links(req);
+	io_put_req(req);
+	return false;
+}
+
+static void io_rw_resubmit(struct callback_head *cb)
+{
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+	struct io_ring_ctx *ctx = req->ctx;
+	int err;
+
+	__set_current_state(TASK_RUNNING);
+
+	err = io_sq_thread_acquire_mm(ctx, req);
+
+	if (io_resubmit_prep(req, err)) {
+		refcount_inc(&req->refs);
+		io_queue_async_work(req);
+	}
+}
+#endif
+
+static bool io_rw_reissue(struct io_kiocb *req, long res)
+{
+#ifdef CONFIG_BLOCK
+	struct task_struct *tsk;
+	int ret;
+
+	if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
+		return false;
+
+	tsk = req->task;
+	init_task_work(&req->task_work, io_rw_resubmit);
+	ret = task_work_add(tsk, &req->task_work, true);
+	if (!ret)
+		return true;
+#endif
+	return false;
+}
+
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
-	io_complete_rw_common(kiocb, res);
-	io_put_req(req);
+	if (!io_rw_reissue(req, res)) {
+		io_complete_rw_common(kiocb, res);
+		io_put_req(req);
+	}
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -2169,6 +2279,9 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (kiocb->ki_flags & IOCB_NOWAIT)
 		req->flags |= REQ_F_NOWAIT;
 
+	if (kiocb->ki_flags & IOCB_DIRECT)
+		io_get_req_task(req);
+
 	if (force_nonblock)
 		kiocb->ki_flags |= IOCB_NOWAIT;
 
@@ -2668,6 +2781,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
+		unsigned long nr_segs = iter.nr_segs;
 		ssize_t ret2 = 0;
 
 		if (req->file->f_op->read_iter)
@@ -2679,6 +2793,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 		if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
 			kiocb_done(kiocb, ret2);
 		} else {
+			iter.count = iov_count;
+			iter.nr_segs = nr_segs;
 copy_iov:
 			ret = io_setup_async_rw(req, io_size, iovec,
 						inline_vecs, &iter);
@@ -2765,6 +2881,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 	iov_count = iov_iter_count(&iter);
 	ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
 	if (!ret) {
+		unsigned long nr_segs = iter.nr_segs;
 		ssize_t ret2;
 
 		/*
@@ -2802,6 +2919,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 		if (!force_nonblock || ret2 != -EAGAIN) {
 			kiocb_done(kiocb, ret2);
 		} else {
+			iter.count = iov_count;
+			iter.nr_segs = nr_segs;
 copy_iov:
 			ret = io_setup_async_rw(req, io_size, iovec,
 						inline_vecs, &iter);
@@ -4282,28 +4401,6 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head,
 	__io_queue_proc(&pt->req->apoll->poll, pt, head);
 }
 
-static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
-{
-	struct mm_struct *mm = current->mm;
-
-	if (mm) {
-		kthread_unuse_mm(mm);
-		mmput(mm);
-	}
-}
-
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
-				   struct io_kiocb *req)
-{
-	if (io_op_defs[req->opcode].needs_mm && !current->mm) {
-		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
-			return -EFAULT;
-		kthread_use_mm(ctx->sqo_mm);
-	}
-
-	return 0;
-}
-
 static void io_async_task_func(struct callback_head *cb)
 {
 	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
@@ -5814,6 +5911,9 @@ static void io_submit_state_start(struct io_submit_state *state,
 				  unsigned int max_ios)
 {
 	blk_start_plug(&state->plug);
+#ifdef CONFIG_BLOCK
+	state->plug.nowait = true;
+#endif
 	state->free_reqs = 0;
 	state->file = NULL;
 	state->ios_left = max_ios;

From 2e85abf053b99a6488f1b529d7aa3b8d7478adae Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 22 May 2020 08:59:42 -0600
Subject: [PATCH 031/502] mm: allow read-ahead with IOCB_NOWAIT set

The read-ahead shouldn't block, so allow it to be done even if
IOCB_NOWAIT is set in the kiocb.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 mm/filemap.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index f0ae9a6308cb..3378d4fca883 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2028,8 +2028,6 @@ find_page:
 
 		page = find_get_page(mapping, index);
 		if (!page) {
-			if (iocb->ki_flags & IOCB_NOWAIT)
-				goto would_block;
 			page_cache_sync_readahead(mapping,
 					ra, filp,
 					index, last_index - index);

From c7510ab2cf5ccd997fe7f194edfe09cc511abf99 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Sat, 23 May 2020 08:22:14 -0600
Subject: [PATCH 032/502] mm: abstract out wake_page_match() from
 wake_page_function()

No functional changes in this patch, just in preparation for allowing
more callers.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/pagemap.h | 37 +++++++++++++++++++++++++++++++++++++
 mm/filemap.c            | 35 ++++-------------------------------
 2 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index cf2468da68e9..2f18221bb5c8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -496,6 +496,43 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 	return pgoff;
 }
 
+/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
+struct wait_page_key {
+	struct page *page;
+	int bit_nr;
+	int page_match;
+};
+
+struct wait_page_queue {
+	struct page *page;
+	int bit_nr;
+	wait_queue_entry_t wait;
+};
+
+static inline int wake_page_match(struct wait_page_queue *wait_page,
+				  struct wait_page_key *key)
+{
+	if (wait_page->page != key->page)
+	       return 0;
+	key->page_match = 1;
+
+	if (wait_page->bit_nr != key->bit_nr)
+		return 0;
+
+	/*
+	 * Stop walking if it's locked.
+	 * Is this safe if put_and_wait_on_page_locked() is in use?
+	 * Yes: the waker must hold a reference to this page, and if PG_locked
+	 * has now already been set by another task, that task must also hold
+	 * a reference to the *same usage* of this page; so there is no need
+	 * to walk on to wake even the put_and_wait_on_page_locked() callers.
+	 */
+	if (test_bit(key->bit_nr, &key->page->flags))
+		return -1;
+
+	return 1;
+}
+
 extern void __lock_page(struct page *page);
 extern int __lock_page_killable(struct page *page);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
diff --git a/mm/filemap.c b/mm/filemap.c
index 3378d4fca883..c3175dbd8fba 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -987,43 +987,16 @@ void __init pagecache_init(void)
 	page_writeback_init();
 }
 
-/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
-struct wait_page_key {
-	struct page *page;
-	int bit_nr;
-	int page_match;
-};
-
-struct wait_page_queue {
-	struct page *page;
-	int bit_nr;
-	wait_queue_entry_t wait;
-};
-
 static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
 {
 	struct wait_page_key *key = arg;
 	struct wait_page_queue *wait_page
 		= container_of(wait, struct wait_page_queue, wait);
+	int ret;
 
-	if (wait_page->page != key->page)
-	       return 0;
-	key->page_match = 1;
-
-	if (wait_page->bit_nr != key->bit_nr)
-		return 0;
-
-	/*
-	 * Stop walking if it's locked.
-	 * Is this safe if put_and_wait_on_page_locked() is in use?
-	 * Yes: the waker must hold a reference to this page, and if PG_locked
-	 * has now already been set by another task, that task must also hold
-	 * a reference to the *same usage* of this page; so there is no need
-	 * to walk on to wake even the put_and_wait_on_page_locked() callers.
-	 */
-	if (test_bit(key->bit_nr, &key->page->flags))
-		return -1;
-
+	ret = wake_page_match(wait_page, key);
+	if (ret != 1)
+		return ret;
 	return autoremove_wake_function(wait, mode, sync, key);
 }
 

From dd3e6d5039de1cbff4e20e2b34390ff44cdb182f Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 22 May 2020 09:12:09 -0600
Subject: [PATCH 033/502] mm: add support for async page locking

Normally waiting for a page to become unlocked, or locking the page,
requires waiting for IO to complete. Add support for lock_page_async()
and wait_on_page_locked_async(), which are callback based instead. This
allows a caller to get notified when a page becomes unlocked, rather
than wait for it.

We add a new iocb field, ki_waitq, to pass in the necessary data for this
to happen. We can unionize this with ki_cookie, since that is only used
for polled IO. Polled IO can never co-exist with async callbacks, as it is
(by definition) polled completions. struct wait_page_key is made public,
and we define struct wait_page_async as the interface between the caller
and the core.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/fs.h      |  7 ++++++-
 include/linux/pagemap.h | 17 ++++++++++++++++
 mm/filemap.c            | 45 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f881a892ea7..2a5cf6080e68 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,8 @@ enum rw_hint {
 #define IOCB_SYNC		(1 << 5)
 #define IOCB_WRITE		(1 << 6)
 #define IOCB_NOWAIT		(1 << 7)
+/* iocb->ki_waitq is valid */
+#define IOCB_WAITQ		(1 << 8)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -328,7 +330,10 @@ struct kiocb {
 	int			ki_flags;
 	u16			ki_hint;
 	u16			ki_ioprio; /* See linux/ioprio.h */
-	unsigned int		ki_cookie; /* for ->iopoll */
+	union {
+		unsigned int		ki_cookie; /* for ->iopoll */
+		struct wait_page_queue	*ki_waitq; /* for async buffered IO */
+	};
 
 	randomized_struct_fields_end
 };
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 2f18221bb5c8..e053e1d9a4d7 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -535,6 +535,7 @@ static inline int wake_page_match(struct wait_page_queue *wait_page,
 
 extern void __lock_page(struct page *page);
 extern int __lock_page_killable(struct page *page);
+extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
 				unsigned int flags);
 extern void unlock_page(struct page *page);
@@ -571,6 +572,22 @@ static inline int lock_page_killable(struct page *page)
 	return 0;
 }
 
+/*
+ * lock_page_async - Lock the page, unless this would block. If the page
+ * is already locked, then queue a callback when the page becomes unlocked.
+ * This callback can then retry the operation.
+ *
+ * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page
+ * was already locked and the callback defined in 'wait' was queued.
+ */
+static inline int lock_page_async(struct page *page,
+				  struct wait_page_queue *wait)
+{
+	if (!trylock_page(page))
+		return __lock_page_async(page, wait);
+	return 0;
+}
+
 /*
  * lock_page_or_retry - Lock the page, unless this would block and the
  * caller indicated that it can handle a retry.
diff --git a/mm/filemap.c b/mm/filemap.c
index c3175dbd8fba..e8aaf43bee9f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1180,6 +1180,36 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
 }
 EXPORT_SYMBOL(wait_on_page_bit_killable);
 
+static int __wait_on_page_locked_async(struct page *page,
+				       struct wait_page_queue *wait, bool set)
+{
+	struct wait_queue_head *q = page_waitqueue(page);
+	int ret = 0;
+
+	wait->page = page;
+	wait->bit_nr = PG_locked;
+
+	spin_lock_irq(&q->lock);
+	__add_wait_queue_entry_tail(q, &wait->wait);
+	SetPageWaiters(page);
+	if (set)
+		ret = !trylock_page(page);
+	else
+		ret = PageLocked(page);
+	/*
+	 * If we were succesful now, we know we're still on the
+	 * waitqueue as we're still under the lock. This means it's
+	 * safe to remove and return success, we know the callback
+	 * isn't going to trigger.
+	 */
+	if (!ret)
+		__remove_wait_queue(q, &wait->wait);
+	else
+		ret = -EIOCBQUEUED;
+	spin_unlock_irq(&q->lock);
+	return ret;
+}
+
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
  * @page: The page to wait for.
@@ -1342,6 +1372,11 @@ int __lock_page_killable(struct page *__page)
 }
 EXPORT_SYMBOL_GPL(__lock_page_killable);
 
+int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+{
+	return __wait_on_page_locked_async(page, wait, true);
+}
+
 /*
  * Return values:
  * 1 - page is locked; mmap_lock is still held.
@@ -2131,6 +2166,11 @@ page_not_up_to_date_locked:
 		}
 
 readpage:
+		if (iocb->ki_flags & IOCB_NOWAIT) {
+			unlock_page(page);
+			put_page(page);
+			goto would_block;
+		}
 		/*
 		 * A previous I/O error may have been due to temporary
 		 * failures, eg. multipath errors.
@@ -2150,7 +2190,10 @@ readpage:
 		}
 
 		if (!PageUptodate(page)) {
-			error = lock_page_killable(page);
+			if (iocb->ki_flags & IOCB_WAITQ)
+				error = lock_page_async(page, iocb->ki_waitq);
+			else
+				error = lock_page_killable(page);
 			if (unlikely(error))
 				goto readpage_error;
 			if (!PageUptodate(page)) {

From 1a0a7853b901c35a742b3bf176cf4701a5c5817c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 22 May 2020 09:18:38 -0600
Subject: [PATCH 034/502] mm: support async buffered reads in
 generic_file_buffered_read()

Use the async page locking infrastructure, if IOCB_WAITQ is set in the
passed in iocb. The caller must expect an -EIOCBQUEUED return value,
which means that IO is started but not done yet. This is similar to how
O_DIRECT signals the same operation. Once the callback is received by
the caller for IO completion, the caller must retry the operation.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 mm/filemap.c | 38 +++++++++++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 11 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index e8aaf43bee9f..a5b1fa8f7ce4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1210,6 +1210,14 @@ static int __wait_on_page_locked_async(struct page *page,
 	return ret;
 }
 
+static int wait_on_page_locked_async(struct page *page,
+				     struct wait_page_queue *wait)
+{
+	if (!PageLocked(page))
+		return 0;
+	return __wait_on_page_locked_async(compound_head(page), wait, false);
+}
+
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
  * @page: The page to wait for.
@@ -2049,17 +2057,25 @@ find_page:
 					index, last_index - index);
 		}
 		if (!PageUptodate(page)) {
-			if (iocb->ki_flags & IOCB_NOWAIT) {
-				put_page(page);
-				goto would_block;
-			}
-
 			/*
 			 * See comment in do_read_cache_page on why
 			 * wait_on_page_locked is used to avoid unnecessarily
 			 * serialisations and why it's safe.
 			 */
-			error = wait_on_page_locked_killable(page);
+			if (iocb->ki_flags & IOCB_WAITQ) {
+				if (written) {
+					put_page(page);
+					goto out;
+				}
+				error = wait_on_page_locked_async(page,
+								iocb->ki_waitq);
+			} else {
+				if (iocb->ki_flags & IOCB_NOWAIT) {
+					put_page(page);
+					goto would_block;
+				}
+				error = wait_on_page_locked_killable(page);
+			}
 			if (unlikely(error))
 				goto readpage_error;
 			if (PageUptodate(page))
@@ -2147,7 +2163,10 @@ page_ok:
 
 page_not_up_to_date:
 		/* Get exclusive access to the page ... */
-		error = lock_page_killable(page);
+		if (iocb->ki_flags & IOCB_WAITQ)
+			error = lock_page_async(page, iocb->ki_waitq);
+		else
+			error = lock_page_killable(page);
 		if (unlikely(error))
 			goto readpage_error;
 
@@ -2190,10 +2209,7 @@ readpage:
 		}
 
 		if (!PageUptodate(page)) {
-			if (iocb->ki_flags & IOCB_WAITQ)
-				error = lock_page_async(page, iocb->ki_waitq);
-			else
-				error = lock_page_killable(page);
+			error = lock_page_killable(page);
 			if (unlikely(error))
 				goto readpage_error;
 			if (!PageUptodate(page)) {

From c2a25ec0f1005dde004cd671484f578a9c8ca7de Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 22 May 2020 09:12:51 -0600
Subject: [PATCH 035/502] fs: add FMODE_BUF_RASYNC

If set, this indicates that the file system supports IOCB_WAITQ for
buffered reads.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/fs.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2a5cf6080e68..4090320360f4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -175,6 +175,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 /* File does not contribute to nr_files count */
 #define FMODE_NOACCOUNT		((__force fmode_t)0x20000000)
 
+/* File supports async buffered reads */
+#define FMODE_BUF_RASYNC	((__force fmode_t)0x40000000)
+
 /*
  * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
  * that indicates that they should check the contents of the iovec are

From a304f0744824fd37d6e1aab4f9715f907724ad11 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 22 May 2020 09:14:08 -0600
Subject: [PATCH 036/502] block: flag block devices as supporting IOCB_WAITQ

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/block_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 0ae656e022fd..679d9346b871 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1851,7 +1851,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 	 */
 	filp->f_flags |= O_LARGEFILE;
 
-	filp->f_mode |= FMODE_NOWAIT;
+	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
 
 	if (filp->f_flags & O_NDELAY)
 		filp->f_mode |= FMODE_NDELAY;

From f89fb730aa02f451fba1f8d5964dfec244d2e2d1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 22 May 2020 09:27:33 -0600
Subject: [PATCH 037/502] xfs: flag files as supporting buffered async reads

XFS uses generic_file_read_iter(), which already supports this.

Acked-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/xfs/xfs_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 00db81eac80d..fdbff4860d61 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -1080,7 +1080,7 @@ xfs_file_open(
 		return -EFBIG;
 	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
 		return -EIO;
-	file->f_mode |= FMODE_NOWAIT;
+	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
 	return 0;
 }
 

From 8730f12b7962b21ea9ad2756abce1e205d22db84 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 22 May 2020 10:19:22 -0600
Subject: [PATCH 038/502] btrfs: flag files as supporting buffered async reads

btrfs uses generic_file_read_iter(), which already supports this.

Acked-by: Chris Mason <clm@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/btrfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2c14312b05e8..234a418eb6da 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3472,7 +3472,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 
 static int btrfs_file_open(struct inode *inode, struct file *filp)
 {
-	filp->f_mode |= FMODE_NOWAIT;
+	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
 	return generic_file_open(inode, filp);
 }
 

From d1932dc3dc268f8dd5201c64971324d06ba977cc Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 22 May 2020 10:18:23 -0600
Subject: [PATCH 039/502] mm: add kiocb_wait_page_queue_init() helper

Checks if the file supports it, and initializes the values that we need.
Caller passes in 'data' pointer, if any, and the callback function to
be used.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/pagemap.h | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e053e1d9a4d7..7386bc67cc5a 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -533,6 +533,27 @@ static inline int wake_page_match(struct wait_page_queue *wait_page,
 	return 1;
 }
 
+static inline int kiocb_wait_page_queue_init(struct kiocb *kiocb,
+					     struct wait_page_queue *wait,
+					     wait_queue_func_t func,
+					     void *data)
+{
+	/* Can't support async wakeup with polled IO */
+	if (kiocb->ki_flags & IOCB_HIPRI)
+		return -EINVAL;
+	if (kiocb->ki_filp->f_mode & FMODE_BUF_RASYNC) {
+		wait->wait.func = func;
+		wait->wait.private = data;
+		wait->wait.flags = 0;
+		INIT_LIST_HEAD(&wait->wait.entry);
+		kiocb->ki_flags |= IOCB_WAITQ;
+		kiocb->ki_waitq = wait;
+		return 0;
+	}
+
+	return -EOPNOTSUPP;
+}
+
 extern void __lock_page(struct page *page);
 extern int __lock_page_killable(struct page *page);
 extern int __lock_page_async(struct page *page, struct wait_page_queue *wait);

From bcf5a06304d69a3bb194a494d87b532d5e90b01c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Fri, 22 May 2020 09:24:42 -0600
Subject: [PATCH 040/502] io_uring: support true async buffered reads, if file
 provides it

If the file is flagged with FMODE_BUF_RASYNC, then we don't have to punt
the buffered read to an io-wq worker. Instead we can rely on page
unlocking callbacks to support retry based async IO. This is a lot more
efficient than doing async thread offload.

The retry is done similarly to how we handle poll based retry. From
the unlock callback, we simply queue the retry to a task_work based
handler.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 135 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f3dbf83fabf3..5d1685e206c1 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -78,6 +78,7 @@
 #include <linux/fs_struct.h>
 #include <linux/splice.h>
 #include <linux/task_work.h>
+#include <linux/pagemap.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -503,6 +504,8 @@ struct io_async_rw {
 	struct iovec			*iov;
 	ssize_t				nr_segs;
 	ssize_t				size;
+	struct wait_page_queue		wpq;
+	struct callback_head		task_work;
 };
 
 struct io_async_ctx {
@@ -2750,6 +2753,126 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static void __io_async_buf_error(struct io_kiocb *req, int error)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	spin_lock_irq(&ctx->completion_lock);
+	io_cqring_fill_event(req, error);
+	io_commit_cqring(ctx);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	req_set_fail_links(req);
+	io_double_put_req(req);
+}
+
+static void io_async_buf_cancel(struct callback_head *cb)
+{
+	struct io_async_rw *rw;
+	struct io_kiocb *req;
+
+	rw = container_of(cb, struct io_async_rw, task_work);
+	req = rw->wpq.wait.private;
+	__io_async_buf_error(req, -ECANCELED);
+}
+
+static void io_async_buf_retry(struct callback_head *cb)
+{
+	struct io_async_rw *rw;
+	struct io_ring_ctx *ctx;
+	struct io_kiocb *req;
+
+	rw = container_of(cb, struct io_async_rw, task_work);
+	req = rw->wpq.wait.private;
+	ctx = req->ctx;
+
+	__set_current_state(TASK_RUNNING);
+	if (!io_sq_thread_acquire_mm(ctx, req)) {
+		mutex_lock(&ctx->uring_lock);
+		__io_queue_sqe(req, NULL);
+		mutex_unlock(&ctx->uring_lock);
+	} else {
+		__io_async_buf_error(req, -EFAULT);
+	}
+}
+
+static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
+			     int sync, void *arg)
+{
+	struct wait_page_queue *wpq;
+	struct io_kiocb *req = wait->private;
+	struct io_async_rw *rw = &req->io->rw;
+	struct wait_page_key *key = arg;
+	struct task_struct *tsk;
+	int ret;
+
+	wpq = container_of(wait, struct wait_page_queue, wait);
+
+	ret = wake_page_match(wpq, key);
+	if (ret != 1)
+		return ret;
+
+	list_del_init(&wait->entry);
+
+	init_task_work(&rw->task_work, io_async_buf_retry);
+	/* submit ref gets dropped, acquire a new one */
+	refcount_inc(&req->refs);
+	tsk = req->task;
+	ret = task_work_add(tsk, &rw->task_work, true);
+	if (unlikely(ret)) {
+		/* queue just for cancelation */
+		init_task_work(&rw->task_work, io_async_buf_cancel);
+		tsk = io_wq_get_task(req->ctx->io_wq);
+		task_work_add(tsk, &rw->task_work, true);
+	}
+	wake_up_process(tsk);
+	return 1;
+}
+
+static bool io_rw_should_retry(struct io_kiocb *req)
+{
+	struct kiocb *kiocb = &req->rw.kiocb;
+	int ret;
+
+	/* never retry for NOWAIT, we just complete with -EAGAIN */
+	if (req->flags & REQ_F_NOWAIT)
+		return false;
+
+	/* already tried, or we're doing O_DIRECT */
+	if (kiocb->ki_flags & (IOCB_DIRECT | IOCB_WAITQ))
+		return false;
+	/*
+	 * just use poll if we can, and don't attempt if the fs doesn't
+	 * support callback based unlocks
+	 */
+	if (file_can_poll(req->file) || !(req->file->f_mode & FMODE_BUF_RASYNC))
+		return false;
+
+	/*
+	 * If request type doesn't require req->io to defer in general,
+	 * we need to allocate it here
+	 */
+	if (!req->io && __io_alloc_async_ctx(req))
+		return false;
+
+	ret = kiocb_wait_page_queue_init(kiocb, &req->io->rw.wpq,
+						io_async_buf_func, req);
+	if (!ret) {
+		io_get_req_task(req);
+		return true;
+	}
+
+	return false;
+}
+
+static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
+{
+	if (req->file->f_op->read_iter)
+		return call_read_iter(req->file, &req->rw.kiocb, iter);
+	return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
+}
+
 static int io_read(struct io_kiocb *req, bool force_nonblock)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
@@ -2784,10 +2907,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 		unsigned long nr_segs = iter.nr_segs;
 		ssize_t ret2 = 0;
 
-		if (req->file->f_op->read_iter)
-			ret2 = call_read_iter(req->file, kiocb, &iter);
-		else
-			ret2 = loop_rw_iter(READ, req->file, kiocb, &iter);
+		ret2 = io_iter_do_read(req, &iter);
 
 		/* Catch -EAGAIN return for forced non-blocking submission */
 		if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
@@ -2804,6 +2924,17 @@ copy_iov:
 			if (!(req->flags & REQ_F_NOWAIT) &&
 			    !file_can_poll(req->file))
 				req->flags |= REQ_F_MUST_PUNT;
+			/* if we can retry, do so with the callbacks armed */
+			if (io_rw_should_retry(req)) {
+				ret2 = io_iter_do_read(req, &iter);
+				if (ret2 == -EIOCBQUEUED) {
+					goto out_free;
+				} else if (ret2 != -EAGAIN) {
+					kiocb_done(kiocb, ret2);
+					goto out_free;
+				}
+			}
+			kiocb->ki_flags &= ~IOCB_WAITQ;
 			return -EAGAIN;
 		}
 	}

From 62ef73165091476d31f31e33d9d0d48b088c129d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 21 Jun 2020 13:09:50 +0300
Subject: [PATCH 041/502] io_uring: remove setting REQ_F_MUST_PUNT in rw

io_{read,write}() {
	...
copy_iov: // prep async
  	if (!(flags & REQ_F_NOWAIT) && !file_can_poll(file))
		flags |= REQ_F_MUST_PUNT;
}

Setting REQ_F_MUST_PUNT there is pointless, because if it happens then
REQ_F_NOWAIT is known to be _not_ set, and the request will take the
async path in __io_queue_sqe() anyway. The file_can_poll() check
is also repeated in arm_poll*(), so it isn't needed here.

Remove the mentioned assignment REQ_F_MUST_PUNT in preparation
for killing the flag.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5d1685e206c1..13f72d2a3fec 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2920,10 +2920,6 @@ copy_iov:
 						inline_vecs, &iter);
 			if (ret)
 				goto out_free;
-			/* any defer here is final, must blocking retry */
-			if (!(req->flags & REQ_F_NOWAIT) &&
-			    !file_can_poll(req->file))
-				req->flags |= REQ_F_MUST_PUNT;
 			/* if we can retry, do so with the callbacks armed */
 			if (io_rw_should_retry(req)) {
 				ret2 = io_iter_do_read(req, &iter);
@@ -3057,10 +3053,6 @@ copy_iov:
 						inline_vecs, &iter);
 			if (ret)
 				goto out_free;
-			/* any defer here is final, must blocking retry */
-			if (!(req->flags & REQ_F_NOWAIT) &&
-			    !file_can_poll(req->file))
-				req->flags |= REQ_F_MUST_PUNT;
 			return -EAGAIN;
 		}
 	}

From 24c74678634b3cbdb325b3b7706366c83811b311 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 21 Jun 2020 13:09:51 +0300
Subject: [PATCH 042/502] io_uring: remove REQ_F_MUST_PUNT

REQ_F_MUST_PUNT may look good and clear, but it's the same
as not having REQ_F_NOWAIT set. That rather creates more confusion.
Moreover, it doesn't even affect any behaviour (e.g. see the patch
removing it from io_{read,write}).

Kill the flag and update already outdated comments.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 13f72d2a3fec..93af915a98e6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -534,7 +534,6 @@ enum {
 	REQ_F_LINK_TIMEOUT_BIT,
 	REQ_F_TIMEOUT_BIT,
 	REQ_F_ISREG_BIT,
-	REQ_F_MUST_PUNT_BIT,
 	REQ_F_TIMEOUT_NOSEQ_BIT,
 	REQ_F_COMP_LOCKED_BIT,
 	REQ_F_NEED_CLEANUP_BIT,
@@ -582,8 +581,6 @@ enum {
 	REQ_F_TIMEOUT		= BIT(REQ_F_TIMEOUT_BIT),
 	/* regular file */
 	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
-	/* must be punted even for NONBLOCK */
-	REQ_F_MUST_PUNT		= BIT(REQ_F_MUST_PUNT_BIT),
 	/* no timeout sequence */
 	REQ_F_TIMEOUT_NOSEQ	= BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
 	/* completion under lock */
@@ -2894,10 +2891,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 	if (req->flags & REQ_F_LINK_HEAD)
 		req->result = io_size;
 
-	/*
-	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
-	 * we know to async punt it even if it was opened O_NONBLOCK
-	 */
+	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, READ))
 		goto copy_iov;
 
@@ -2993,10 +2987,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 	if (req->flags & REQ_F_LINK_HEAD)
 		req->result = io_size;
 
-	/*
-	 * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
-	 * we know to async punt it even if it was opened O_NONBLOCK
-	 */
+	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, WRITE))
 		goto copy_iov;
 
@@ -3717,8 +3708,10 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
 
 	/* if the file has a flush method, be safe and punt to async */
 	if (close->put_file->f_op->flush && force_nonblock) {
+		/* was never set, but play safe */
+		req->flags &= ~REQ_F_NOWAIT;
 		/* avoid grabbing files - we don't need the files */
-		req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT;
+		req->flags |= REQ_F_NO_FILE_TABLE;
 		return -EAGAIN;
 	}
 
@@ -4645,7 +4638,7 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 
 	if (!req->file || !file_can_poll(req->file))
 		return false;
-	if (req->flags & (REQ_F_MUST_PUNT | REQ_F_POLLED))
+	if (req->flags & REQ_F_POLLED)
 		return false;
 	if (!def->pollin && !def->pollout)
 		return false;
@@ -5852,8 +5845,7 @@ again:
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
 	 * doesn't support non-blocking read/write attempts
 	 */
-	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
-	    (req->flags & REQ_F_MUST_PUNT))) {
+	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
 		if (io_arm_poll_handler(req)) {
 			if (linked_timeout)
 				io_queue_linked_timeout(linked_timeout);

From b90cd197f9315f968d5ee4e6ee9f4e3067f2c883 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 21 Jun 2020 13:09:52 +0300
Subject: [PATCH 043/502] io_uring: set @poll->file after @poll init

It's good practice to modify the fields of a struct after, not before,
it has been initialised. Even though io_init_poll_iocb() doesn't touch
poll->file, call it first.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 93af915a98e6..cc1f2f3b7bfa 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4596,8 +4596,8 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req,
 	struct io_ring_ctx *ctx = req->ctx;
 	bool cancel = false;
 
-	poll->file = req->file;
 	io_init_poll_iocb(poll, mask, wake_func);
+	poll->file = req->file;
 	poll->wait.private = req;
 
 	ipt->pt._key = mask;

From f6b6c7d6a9600bdbf5826f57137630e1670e2d87 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 21 Jun 2020 13:09:53 +0300
Subject: [PATCH 044/502] io_uring: kill NULL checks for submit state

After recent changes, io_submit_sqes() always passes valid submit state,
so kill leftovers checking it for NULL.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index cc1f2f3b7bfa..c686061c3762 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1376,11 +1376,7 @@ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx,
 	gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
 	struct io_kiocb *req;
 
-	if (!state) {
-		req = kmem_cache_alloc(req_cachep, gfp);
-		if (unlikely(!req))
-			goto fallback;
-	} else if (!state->free_reqs) {
+	if (!state->free_reqs) {
 		size_t sz;
 		int ret;
 

From d3cac64c498c4fb2df46b97ee6f4c7d6d75f5e3d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 25 Jun 2020 12:38:13 +0300
Subject: [PATCH 045/502] io_uring: fix NULL-mm for linked reqs

__io_queue_sqe() tries to handle all requests of a link,
so it's not enough to grab mm in io_sq_thread_acquire_mm()
based just on the head.

Don't check req->needs_mm and do it always.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 fs/io_uring.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c686061c3762..72739188b2ff 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1991,10 +1991,9 @@ static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
 	}
 }
 
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
-				   struct io_kiocb *req)
+static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
 {
-	if (io_op_defs[req->opcode].needs_mm && !current->mm) {
+	if (!current->mm) {
 		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
 			return -EFAULT;
 		kthread_use_mm(ctx->sqo_mm);
@@ -2003,6 +2002,14 @@ static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
 	return 0;
 }
 
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+				   struct io_kiocb *req)
+{
+	if (!io_op_defs[req->opcode].needs_mm)
+		return 0;
+	return __io_sq_thread_acquire_mm(ctx);
+}
+
 #ifdef CONFIG_BLOCK
 static bool io_resubmit_prep(struct io_kiocb *req, int error)
 {
@@ -2781,7 +2788,7 @@ static void io_async_buf_retry(struct callback_head *cb)
 	ctx = req->ctx;
 
 	__set_current_state(TASK_RUNNING);
-	if (!io_sq_thread_acquire_mm(ctx, req)) {
+	if (!__io_sq_thread_acquire_mm(ctx)) {
 		mutex_lock(&ctx->uring_lock);
 		__io_queue_sqe(req, NULL);
 		mutex_unlock(&ctx->uring_lock);

From e1e16097e265daac918ce355bf1a0d1677adf0c7 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 22 Jun 2020 09:17:17 -0600
Subject: [PATCH 046/502] io_uring: provide generic io_req_complete() helper

We have lots of callers of:

io_cqring_add_event(req, result);
io_put_req(req);

Provide a helper that does this for us. It helps clean up the code, and
also provides a more convenient location for us to change the completion
handling.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 106 ++++++++++++++++++++------------------------------
 1 file changed, 43 insertions(+), 63 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 72739188b2ff..17d7bafaf8cf 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1335,7 +1335,7 @@ static void io_cqring_fill_event(struct io_kiocb *req, long res)
 	__io_cqring_fill_event(req, res, 0);
 }
 
-static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
+static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
@@ -1348,9 +1348,15 @@ static void __io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
 	io_cqring_ev_posted(ctx);
 }
 
-static void io_cqring_add_event(struct io_kiocb *req, long res)
+static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags)
 {
-	__io_cqring_add_event(req, res, 0);
+	io_cqring_add_event(req, res, cflags);
+	io_put_req(req);
+}
+
+static void io_req_complete(struct io_kiocb *req, long res)
+{
+	__io_req_complete(req, res, 0);
 }
 
 static inline bool io_is_fallback_req(struct io_kiocb *req)
@@ -1978,7 +1984,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
 		req_set_fail_links(req);
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_kbuf(req);
-	__io_cqring_add_event(req, res, cflags);
+	io_cqring_add_event(req, res, cflags);
 }
 
 static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
@@ -2048,9 +2054,8 @@ static bool io_resubmit_prep(struct io_kiocb *req, int error)
 		return true;
 	kfree(iovec);
 end_req:
-	io_cqring_add_event(req, ret);
 	req_set_fail_links(req);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return false;
 }
 
@@ -3117,10 +3122,9 @@ static int io_tee(struct io_kiocb *req, bool force_nonblock)
 	io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 
-	io_cqring_add_event(req, ret);
 	if (ret != sp->len)
 		req_set_fail_links(req);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3154,10 +3158,9 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
 	io_put_file(req, in, (sp->flags & SPLICE_F_FD_IN_FIXED));
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 
-	io_cqring_add_event(req, ret);
 	if (ret != sp->len)
 		req_set_fail_links(req);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3171,8 +3174,7 @@ static int io_nop(struct io_kiocb *req)
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 
-	io_cqring_add_event(req, 0);
-	io_put_req(req);
+	io_req_complete(req, 0);
 	return 0;
 }
 
@@ -3211,8 +3213,7 @@ static int io_fsync(struct io_kiocb *req, bool force_nonblock)
 				req->sync.flags & IORING_FSYNC_DATASYNC);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3245,8 +3246,7 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3342,8 +3342,7 @@ err:
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3416,8 +3415,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
 	io_ring_submit_lock(ctx, !force_nonblock);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3504,8 +3502,7 @@ out:
 	io_ring_submit_unlock(ctx, !force_nonblock);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3548,8 +3545,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
 
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 #else
 	return -EOPNOTSUPP;
@@ -3585,8 +3581,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock)
 	ret = do_madvise(ma->addr, ma->len, ma->advice);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 #else
 	return -EOPNOTSUPP;
@@ -3625,8 +3620,7 @@ static int io_fadvise(struct io_kiocb *req, bool force_nonblock)
 	ret = vfs_fadvise(req->file, fa->offset, fa->len, fa->advice);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3665,8 +3659,7 @@ static int io_statx(struct io_kiocb *req, bool force_nonblock)
 
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3722,10 +3715,9 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
 	ret = filp_close(close->put_file, req->work.files);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
 	fput(close->put_file);
 	close->put_file = NULL;
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3759,8 +3751,7 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock)
 				req->sync.flags);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3859,10 +3850,9 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
 	if (kmsg && kmsg->iov != kmsg->fast_iov)
 		kfree(kmsg->iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	io_cqring_add_event(req, ret);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -3902,10 +3892,9 @@ static int io_send(struct io_kiocb *req, bool force_nonblock)
 			ret = -EINTR;
 	}
 
-	io_cqring_add_event(req, ret);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -4102,10 +4091,9 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
 	if (kmsg && kmsg->iov != kmsg->fast_iov)
 		kfree(kmsg->iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	__io_cqring_add_event(req, ret, cflags);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_put_req(req);
+	__io_req_complete(req, ret, cflags);
 	return 0;
 }
 
@@ -4159,10 +4147,9 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock)
 
 	kfree(kbuf);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
-	__io_cqring_add_event(req, ret, cflags);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_put_req(req);
+	__io_req_complete(req, ret, cflags);
 	return 0;
 }
 
@@ -4201,8 +4188,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock)
 			ret = -EINTR;
 		req_set_fail_links(req);
 	}
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -4262,8 +4248,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock)
 out:
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 #else /* !CONFIG_NET */
@@ -4555,7 +4540,7 @@ static void io_async_task_func(struct callback_head *cb)
 	if (!canceled) {
 		__set_current_state(TASK_RUNNING);
 		if (io_sq_thread_acquire_mm(ctx, req)) {
-			io_cqring_add_event(req, -EFAULT);
+			io_cqring_add_event(req, -EFAULT, 0);
 			goto end_req;
 		}
 		mutex_lock(&ctx->uring_lock);
@@ -4804,10 +4789,9 @@ static int io_poll_remove(struct io_kiocb *req)
 	ret = io_poll_cancel(ctx, addr);
 	spin_unlock_irq(&ctx->completion_lock);
 
-	io_cqring_add_event(req, ret);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -5163,8 +5147,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock)
 
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_cqring_add_event(req, ret);
-	io_put_req(req);
+	io_req_complete(req, ret);
 	return 0;
 }
 
@@ -5657,8 +5640,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 
 	if (ret) {
 		req_set_fail_links(req);
-		io_cqring_add_event(req, ret);
-		io_put_req(req);
+		io_req_complete(req, ret);
 	}
 
 	io_steal_work(req, workptr);
@@ -5775,8 +5757,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 		io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME);
 		io_put_req(prev);
 	} else {
-		io_cqring_add_event(req, -ETIME);
-		io_put_req(req);
+		io_req_complete(req, -ETIME);
 	}
 	return HRTIMER_NORESTART;
 }
@@ -5885,9 +5866,8 @@ err:
 
 	/* and drop final reference, if we failed */
 	if (ret) {
-		io_cqring_add_event(req, ret);
 		req_set_fail_links(req);
-		io_put_req(req);
+		io_req_complete(req, ret);
 	}
 	if (nxt) {
 		req = nxt;
@@ -5909,9 +5889,9 @@ static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (ret) {
 		if (ret != -EIOCBQUEUED) {
 fail_req:
-			io_cqring_add_event(req, ret);
 			req_set_fail_links(req);
-			io_double_put_req(req);
+			io_put_req(req);
+			io_req_complete(req, ret);
 		}
 	} else if (req->flags & REQ_F_FORCE_ASYNC) {
 		if (!req->io) {
@@ -5937,8 +5917,8 @@ fail_req:
 static inline void io_queue_link_head(struct io_kiocb *req)
 {
 	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
-		io_cqring_add_event(req, -ECANCELED);
-		io_double_put_req(req);
+		io_put_req(req);
+		io_req_complete(req, -ECANCELED);
 	} else
 		io_queue_sqe(req, NULL);
 }
@@ -6195,8 +6175,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 
 		if (unlikely(err)) {
 fail_req:
-			io_cqring_add_event(req, err);
-			io_double_put_req(req);
+			io_put_req(req);
+			io_req_complete(req, err);
 			break;
 		}
 

From 013538bd65fd3cdbf3ca8b0c99b962c70473c803 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 22 Jun 2020 09:29:15 -0600
Subject: [PATCH 047/502] io_uring: add 'io_comp_state' to struct
 io_submit_state

No functional changes in this patch, just in preparation for passing back
pending completions to the caller and completing them in a batched
fashion.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 17d7bafaf8cf..002ab5eae20f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -678,6 +678,12 @@ struct io_kiocb {
 
 #define IO_IOPOLL_BATCH			8
 
+struct io_comp_state {
+	unsigned int		nr;
+	struct list_head	list;
+	struct io_ring_ctx	*ctx;
+};
+
 struct io_submit_state {
 	struct blk_plug		plug;
 
@@ -687,6 +693,11 @@ struct io_submit_state {
 	void			*reqs[IO_IOPOLL_BATCH];
 	unsigned int		free_reqs;
 
+	/*
+	 * Batch completion logic
+	 */
+	struct io_comp_state	comp;
+
 	/*
 	 * File reference cache
 	 */
@@ -6006,12 +6017,15 @@ static void io_submit_state_end(struct io_submit_state *state)
  * Start submission side cache.
  */
 static void io_submit_state_start(struct io_submit_state *state,
-				  unsigned int max_ios)
+				  struct io_ring_ctx *ctx, unsigned int max_ios)
 {
 	blk_start_plug(&state->plug);
 #ifdef CONFIG_BLOCK
 	state->plug.nowait = true;
 #endif
+	state->comp.nr = 0;
+	INIT_LIST_HEAD(&state->comp.list);
+	state->comp.ctx = ctx;
 	state->free_reqs = 0;
 	state->file = NULL;
 	state->ios_left = max_ios;
@@ -6146,7 +6160,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 	if (!percpu_ref_tryget_many(&ctx->refs, nr))
 		return -EAGAIN;
 
-	io_submit_state_start(&state, nr);
+	io_submit_state_start(&state, ctx, nr);
 
 	ctx->ring_fd = ring_fd;
 	ctx->ring_file = ring_file;

From f13fad7ba41cef806358885fbb3f9004f3214b2d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 22 Jun 2020 09:34:30 -0600
Subject: [PATCH 048/502] io_uring: pass down completion state on the issue
 side

No functional changes in this patch, just in preparation for having the
completion state be available on the issue side. Later on, this will
allow requests that complete inline to be completed in batches.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 67 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 50 insertions(+), 17 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 002ab5eae20f..46241c1ad1b8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -909,7 +909,8 @@ static void io_cleanup_req(struct io_kiocb *req);
 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 		       int fd, struct file **out_file, bool fixed);
 static void __io_queue_sqe(struct io_kiocb *req,
-			   const struct io_uring_sqe *sqe);
+			   const struct io_uring_sqe *sqe,
+			   struct io_comp_state *cs);
 
 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 			       struct iovec **iovec, struct iov_iter *iter,
@@ -2806,7 +2807,7 @@ static void io_async_buf_retry(struct callback_head *cb)
 	__set_current_state(TASK_RUNNING);
 	if (!__io_sq_thread_acquire_mm(ctx)) {
 		mutex_lock(&ctx->uring_lock);
-		__io_queue_sqe(req, NULL);
+		__io_queue_sqe(req, NULL, NULL);
 		mutex_unlock(&ctx->uring_lock);
 	} else {
 		__io_async_buf_error(req, -EFAULT);
@@ -4430,7 +4431,7 @@ static void io_poll_task_func(struct callback_head *cb)
 		struct io_ring_ctx *ctx = nxt->ctx;
 
 		mutex_lock(&ctx->uring_lock);
-		__io_queue_sqe(nxt, NULL);
+		__io_queue_sqe(nxt, NULL, NULL);
 		mutex_unlock(&ctx->uring_lock);
 	}
 }
@@ -4555,7 +4556,7 @@ static void io_async_task_func(struct callback_head *cb)
 			goto end_req;
 		}
 		mutex_lock(&ctx->uring_lock);
-		__io_queue_sqe(req, NULL);
+		__io_queue_sqe(req, NULL, NULL);
 		mutex_unlock(&ctx->uring_lock);
 	} else {
 		io_cqring_ev_posted(ctx);
@@ -5352,7 +5353,7 @@ static void io_cleanup_req(struct io_kiocb *req)
 }
 
 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-			bool force_nonblock)
+			bool force_nonblock, struct io_comp_state *cs)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
@@ -5637,7 +5638,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 
 	if (!ret) {
 		do {
-			ret = io_issue_sqe(req, NULL, false);
+			ret = io_issue_sqe(req, NULL, false, NULL);
 			/*
 			 * We can get EAGAIN for polled IO even though we're
 			 * forcing a sync submission from here, since we can't
@@ -5814,7 +5815,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 	return nxt;
 }
 
-static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			   struct io_comp_state *cs)
 {
 	struct io_kiocb *linked_timeout;
 	struct io_kiocb *nxt;
@@ -5834,7 +5836,7 @@ again:
 			old_creds = override_creds(req->work.creds);
 	}
 
-	ret = io_issue_sqe(req, sqe, true);
+	ret = io_issue_sqe(req, sqe, true, cs);
 
 	/*
 	 * We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -5892,7 +5894,8 @@ exit:
 		revert_creds(old_creds);
 }
 
-static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+			 struct io_comp_state *cs)
 {
 	int ret;
 
@@ -5921,21 +5924,22 @@ fail_req:
 		req->work.flags |= IO_WQ_WORK_CONCURRENT;
 		io_queue_async_work(req);
 	} else {
-		__io_queue_sqe(req, sqe);
+		__io_queue_sqe(req, sqe, cs);
 	}
 }
 
-static inline void io_queue_link_head(struct io_kiocb *req)
+static inline void io_queue_link_head(struct io_kiocb *req,
+				      struct io_comp_state *cs)
 {
 	if (unlikely(req->flags & REQ_F_FAIL_LINK)) {
 		io_put_req(req);
 		io_req_complete(req, -ECANCELED);
 	} else
-		io_queue_sqe(req, NULL);
+		io_queue_sqe(req, NULL, cs);
 }
 
 static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
-			 struct io_kiocb **link)
+			 struct io_kiocb **link, struct io_comp_state *cs)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	int ret;
@@ -5975,7 +5979,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 		/* last request of a link, enqueue the link */
 		if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) {
-			io_queue_link_head(head);
+			io_queue_link_head(head, cs);
 			*link = NULL;
 		}
 	} else {
@@ -5995,18 +5999,47 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				req->flags |= REQ_F_FAIL_LINK;
 			*link = req;
 		} else {
-			io_queue_sqe(req, sqe);
+			io_queue_sqe(req, sqe, cs);
 		}
 	}
 
 	return 0;
 }
 
+static void io_submit_flush_completions(struct io_comp_state *cs)
+{
+	struct io_ring_ctx *ctx = cs->ctx;
+
+	spin_lock_irq(&ctx->completion_lock);
+	while (!list_empty(&cs->list)) {
+		struct io_kiocb *req;
+
+		req = list_first_entry(&cs->list, struct io_kiocb, list);
+		list_del(&req->list);
+		io_cqring_fill_event(req, req->result);
+		if (!(req->flags & REQ_F_LINK_HEAD)) {
+			req->flags |= REQ_F_COMP_LOCKED;
+			io_put_req(req);
+		} else {
+			spin_unlock_irq(&ctx->completion_lock);
+			io_put_req(req);
+			spin_lock_irq(&ctx->completion_lock);
+		}
+	}
+	io_commit_cqring(ctx);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	cs->nr = 0;
+}
+
 /*
  * Batched submission is done, ensure local IO is flushed out.
  */
 static void io_submit_state_end(struct io_submit_state *state)
 {
+	if (!list_empty(&state->comp.list))
+		io_submit_flush_completions(&state->comp);
 	blk_finish_plug(&state->plug);
 	io_state_file_put(state);
 	if (state->free_reqs)
@@ -6196,7 +6229,7 @@ fail_req:
 
 		trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data,
 						true, io_async_submit(ctx));
-		err = io_submit_sqe(req, sqe, &link);
+		err = io_submit_sqe(req, sqe, &link, &state.comp);
 		if (err)
 			goto fail_req;
 	}
@@ -6207,7 +6240,7 @@ fail_req:
 		percpu_ref_put_many(&ctx->refs, nr - ref_used);
 	}
 	if (link)
-		io_queue_link_head(link);
+		io_queue_link_head(link, &state.comp);
 	io_submit_state_end(&state);
 
 	 /* Commit SQ ring head once we've consumed and submitted all SQEs */

From 229a7b63507a3e84afb17c3bbb67505a81d28a1d Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 22 Jun 2020 10:13:11 -0600
Subject: [PATCH 049/502] io_uring: pass in completion state to appropriate
 issue side handlers

Provide the completion state to the handlers that we know can complete
inline, so they can utilize this for batching completions.

Cap the max batch count at 32. This should be enough to provide a good
amortization of the cost of the lock+commit dance for completions, while
still being low enough not to cause any real latency issues for SQPOLL
applications.

Xuan Zhuo <xuanzhuo@linux.alibaba.com> reports that this changes his
profile from:

17.97% [kernel] [k] copy_user_generic_unrolled
13.92% [kernel] [k] io_commit_cqring
11.04% [kernel] [k] __io_cqring_fill_event
10.33% [kernel] [k] udp_recvmsg
 5.94% [kernel] [k] skb_release_data
 4.31% [kernel] [k] udp_rmem_release
 2.68% [kernel] [k] __check_object_size
 2.24% [kernel] [k] __slab_free
 2.22% [kernel] [k] _raw_spin_lock_bh
 2.21% [kernel] [k] kmem_cache_free
 2.13% [kernel] [k] free_pcppages_bulk
 1.83% [kernel] [k] io_submit_sqes
 1.38% [kernel] [k] page_frag_free
 1.31% [kernel] [k] inet_recvmsg

to

19.99% [kernel] [k] copy_user_generic_unrolled
11.63% [kernel] [k] skb_release_data
 9.36% [kernel] [k] udp_rmem_release
 8.64% [kernel] [k] udp_recvmsg
 6.21% [kernel] [k] __slab_free
 4.39% [kernel] [k] __check_object_size
 3.64% [kernel] [k] free_pcppages_bulk
 2.41% [kernel] [k] kmem_cache_free
 2.00% [kernel] [k] io_submit_sqes
 1.95% [kernel] [k] page_frag_free
 1.54% [kernel] [k] io_put_req
[...]
 0.07% [kernel] [k] io_commit_cqring
 0.44% [kernel] [k] __io_cqring_fill_event

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 153 ++++++++++++++++++++++++++++----------------------
 1 file changed, 86 insertions(+), 67 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 46241c1ad1b8..6c9ca4fcbc31 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1360,15 +1360,50 @@ static void io_cqring_add_event(struct io_kiocb *req, long res, long cflags)
 	io_cqring_ev_posted(ctx);
 }
 
-static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags)
+static void io_submit_flush_completions(struct io_comp_state *cs)
 {
-	io_cqring_add_event(req, res, cflags);
-	io_put_req(req);
+	struct io_ring_ctx *ctx = cs->ctx;
+
+	spin_lock_irq(&ctx->completion_lock);
+	while (!list_empty(&cs->list)) {
+		struct io_kiocb *req;
+
+		req = list_first_entry(&cs->list, struct io_kiocb, list);
+		list_del(&req->list);
+		io_cqring_fill_event(req, req->result);
+		if (!(req->flags & REQ_F_LINK_HEAD)) {
+			req->flags |= REQ_F_COMP_LOCKED;
+			io_put_req(req);
+		} else {
+			spin_unlock_irq(&ctx->completion_lock);
+			io_put_req(req);
+			spin_lock_irq(&ctx->completion_lock);
+		}
+	}
+	io_commit_cqring(ctx);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	cs->nr = 0;
+}
+
+static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
+			      struct io_comp_state *cs)
+{
+	if (!cs) {
+		io_cqring_add_event(req, res, cflags);
+		io_put_req(req);
+	} else {
+		req->result = res;
+		list_add_tail(&req->list, &cs->list);
+		if (++cs->nr >= 32)
+			io_submit_flush_completions(cs);
+	}
 }
 
 static void io_req_complete(struct io_kiocb *req, long res)
 {
-	__io_req_complete(req, res, 0);
+	__io_req_complete(req, res, 0, NULL);
 }
 
 static inline bool io_is_fallback_req(struct io_kiocb *req)
@@ -3179,14 +3214,14 @@ static int io_splice(struct io_kiocb *req, bool force_nonblock)
 /*
  * IORING_OP_NOP just posts a completion event, nothing else.
  */
-static int io_nop(struct io_kiocb *req)
+static int io_nop(struct io_kiocb *req, struct io_comp_state *cs)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
 	if (unlikely(ctx->flags & IORING_SETUP_IOPOLL))
 		return -EINVAL;
 
-	io_req_complete(req, 0);
+	__io_req_complete(req, 0, 0, cs);
 	return 0;
 }
 
@@ -3408,7 +3443,8 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, struct io_buffer *buf,
 	return i;
 }
 
-static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
+static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock,
+			     struct io_comp_state *cs)
 {
 	struct io_provide_buf *p = &req->pbuf;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -3427,7 +3463,7 @@ static int io_remove_buffers(struct io_kiocb *req, bool force_nonblock)
 	io_ring_submit_lock(ctx, !force_nonblock);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_req_complete(req, ret);
+	__io_req_complete(req, ret, 0, cs);
 	return 0;
 }
 
@@ -3485,7 +3521,8 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head)
 	return i ? i : -ENOMEM;
 }
 
-static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock)
+static int io_provide_buffers(struct io_kiocb *req, bool force_nonblock,
+			      struct io_comp_state *cs)
 {
 	struct io_provide_buf *p = &req->pbuf;
 	struct io_ring_ctx *ctx = req->ctx;
@@ -3514,7 +3551,7 @@ out:
 	io_ring_submit_unlock(ctx, !force_nonblock);
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_req_complete(req, ret);
+	__io_req_complete(req, ret, 0, cs);
 	return 0;
 }
 
@@ -3545,7 +3582,8 @@ static int io_epoll_ctl_prep(struct io_kiocb *req,
 #endif
 }
 
-static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
+static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock,
+			struct io_comp_state *cs)
 {
 #if defined(CONFIG_EPOLL)
 	struct io_epoll *ie = &req->epoll;
@@ -3557,7 +3595,7 @@ static int io_epoll_ctl(struct io_kiocb *req, bool force_nonblock)
 
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_req_complete(req, ret);
+	__io_req_complete(req, ret, 0, cs);
 	return 0;
 #else
 	return -EOPNOTSUPP;
@@ -3702,7 +3740,8 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_close(struct io_kiocb *req, bool force_nonblock)
+static int io_close(struct io_kiocb *req, bool force_nonblock,
+		    struct io_comp_state *cs)
 {
 	struct io_close *close = &req->close;
 	int ret;
@@ -3729,7 +3768,7 @@ static int io_close(struct io_kiocb *req, bool force_nonblock)
 		req_set_fail_links(req);
 	fput(close->put_file);
 	close->put_file = NULL;
-	io_req_complete(req, ret);
+	__io_req_complete(req, ret, 0, cs);
 	return 0;
 }
 
@@ -3815,7 +3854,8 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return ret;
 }
 
-static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
+		      struct io_comp_state *cs)
 {
 	struct io_async_msghdr *kmsg = NULL;
 	struct socket *sock;
@@ -3864,11 +3904,12 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_req_complete(req, ret);
+	__io_req_complete(req, ret, 0, cs);
 	return 0;
 }
 
-static int io_send(struct io_kiocb *req, bool force_nonblock)
+static int io_send(struct io_kiocb *req, bool force_nonblock,
+		   struct io_comp_state *cs)
 {
 	struct socket *sock;
 	int ret;
@@ -3906,7 +3947,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock)
 
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_req_complete(req, ret);
+	__io_req_complete(req, ret, 0, cs);
 	return 0;
 }
 
@@ -4049,7 +4090,8 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	return ret;
 }
 
-static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
+		      struct io_comp_state *cs)
 {
 	struct io_async_msghdr *kmsg = NULL;
 	struct socket *sock;
@@ -4105,11 +4147,12 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, cflags);
+	__io_req_complete(req, ret, cflags, cs);
 	return 0;
 }
 
-static int io_recv(struct io_kiocb *req, bool force_nonblock)
+static int io_recv(struct io_kiocb *req, bool force_nonblock,
+		   struct io_comp_state *cs)
 {
 	struct io_buffer *kbuf = NULL;
 	struct socket *sock;
@@ -4161,7 +4204,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock)
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
 		req_set_fail_links(req);
-	__io_req_complete(req, ret, cflags);
+	__io_req_complete(req, ret, cflags, cs);
 	return 0;
 }
 
@@ -4181,7 +4224,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return 0;
 }
 
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock,
+		     struct io_comp_state *cs)
 {
 	struct io_accept *accept = &req->accept;
 	unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0;
@@ -4200,7 +4244,7 @@ static int io_accept(struct io_kiocb *req, bool force_nonblock)
 			ret = -EINTR;
 		req_set_fail_links(req);
 	}
-	io_req_complete(req, ret);
+	__io_req_complete(req, ret, 0, cs);
 	return 0;
 }
 
@@ -4224,7 +4268,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 					&io->connect.address);
 }
 
-static int io_connect(struct io_kiocb *req, bool force_nonblock)
+static int io_connect(struct io_kiocb *req, bool force_nonblock,
+		      struct io_comp_state *cs)
 {
 	struct io_async_ctx __io, *io;
 	unsigned file_flags;
@@ -4260,7 +4305,7 @@ static int io_connect(struct io_kiocb *req, bool force_nonblock)
 out:
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_req_complete(req, ret);
+	__io_req_complete(req, ret, 0, cs);
 	return 0;
 }
 #else /* !CONFIG_NET */
@@ -5141,7 +5186,8 @@ static int io_files_update_prep(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_files_update(struct io_kiocb *req, bool force_nonblock)
+static int io_files_update(struct io_kiocb *req, bool force_nonblock,
+			   struct io_comp_state *cs)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_uring_files_update up;
@@ -5159,7 +5205,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock)
 
 	if (ret < 0)
 		req_set_fail_links(req);
-	io_req_complete(req, ret);
+	__io_req_complete(req, ret, 0, cs);
 	return 0;
 }
 
@@ -5360,7 +5406,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	switch (req->opcode) {
 	case IORING_OP_NOP:
-		ret = io_nop(req);
+		ret = io_nop(req, cs);
 		break;
 	case IORING_OP_READV:
 	case IORING_OP_READ_FIXED:
@@ -5422,9 +5468,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				break;
 		}
 		if (req->opcode == IORING_OP_SENDMSG)
-			ret = io_sendmsg(req, force_nonblock);
+			ret = io_sendmsg(req, force_nonblock, cs);
 		else
-			ret = io_send(req, force_nonblock);
+			ret = io_send(req, force_nonblock, cs);
 		break;
 	case IORING_OP_RECVMSG:
 	case IORING_OP_RECV:
@@ -5434,9 +5480,9 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 				break;
 		}
 		if (req->opcode == IORING_OP_RECVMSG)
-			ret = io_recvmsg(req, force_nonblock);
+			ret = io_recvmsg(req, force_nonblock, cs);
 		else
-			ret = io_recv(req, force_nonblock);
+			ret = io_recv(req, force_nonblock, cs);
 		break;
 	case IORING_OP_TIMEOUT:
 		if (sqe) {
@@ -5460,7 +5506,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (ret)
 				break;
 		}
-		ret = io_accept(req, force_nonblock);
+		ret = io_accept(req, force_nonblock, cs);
 		break;
 	case IORING_OP_CONNECT:
 		if (sqe) {
@@ -5468,7 +5514,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (ret)
 				break;
 		}
-		ret = io_connect(req, force_nonblock);
+		ret = io_connect(req, force_nonblock, cs);
 		break;
 	case IORING_OP_ASYNC_CANCEL:
 		if (sqe) {
@@ -5500,7 +5546,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (ret)
 				break;
 		}
-		ret = io_close(req, force_nonblock);
+		ret = io_close(req, force_nonblock, cs);
 		break;
 	case IORING_OP_FILES_UPDATE:
 		if (sqe) {
@@ -5508,7 +5554,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (ret)
 				break;
 		}
-		ret = io_files_update(req, force_nonblock);
+		ret = io_files_update(req, force_nonblock, cs);
 		break;
 	case IORING_OP_STATX:
 		if (sqe) {
@@ -5548,7 +5594,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (ret)
 				break;
 		}
-		ret = io_epoll_ctl(req, force_nonblock);
+		ret = io_epoll_ctl(req, force_nonblock, cs);
 		break;
 	case IORING_OP_SPLICE:
 		if (sqe) {
@@ -5564,7 +5610,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (ret)
 				break;
 		}
-		ret = io_provide_buffers(req, force_nonblock);
+		ret = io_provide_buffers(req, force_nonblock, cs);
 		break;
 	case IORING_OP_REMOVE_BUFFERS:
 		if (sqe) {
@@ -5572,7 +5618,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (ret)
 				break;
 		}
-		ret = io_remove_buffers(req, force_nonblock);
+		ret = io_remove_buffers(req, force_nonblock, cs);
 		break;
 	case IORING_OP_TEE:
 		if (sqe) {
@@ -6006,33 +6052,6 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
-static void io_submit_flush_completions(struct io_comp_state *cs)
-{
-	struct io_ring_ctx *ctx = cs->ctx;
-
-	spin_lock_irq(&ctx->completion_lock);
-	while (!list_empty(&cs->list)) {
-		struct io_kiocb *req;
-
-		req = list_first_entry(&cs->list, struct io_kiocb, list);
-		list_del(&req->list);
-		io_cqring_fill_event(req, req->result);
-		if (!(req->flags & REQ_F_LINK_HEAD)) {
-			req->flags |= REQ_F_COMP_LOCKED;
-			io_put_req(req);
-		} else {
-			spin_unlock_irq(&ctx->completion_lock);
-			io_put_req(req);
-			spin_lock_irq(&ctx->completion_lock);
-		}
-	}
-	io_commit_cqring(ctx);
-	spin_unlock_irq(&ctx->completion_lock);
-
-	io_cqring_ev_posted(ctx);
-	cs->nr = 0;
-}
-
 /*
  * Batched submission is done, ensure local IO is flushed out.
  */

From a1d7c393c4711a9ce6c239c3ab053a50dc96505a Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Mon, 22 Jun 2020 11:09:46 -0600
Subject: [PATCH 050/502] io_uring: enable READ/WRITE to use deferred
 completions

A bit more surgery required here, as completions are generally done
through the kiocb->ki_complete() callback, even if they complete inline.
This enables the regular read/write path to use the io_comp_state
logic to batch inline completions.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 38 +++++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6c9ca4fcbc31..0bba12e4e559 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2019,7 +2019,8 @@ static inline void req_set_fail_links(struct io_kiocb *req)
 		req->flags |= REQ_F_FAIL_LINK;
 }
 
-static void io_complete_rw_common(struct kiocb *kiocb, long res)
+static void io_complete_rw_common(struct kiocb *kiocb, long res,
+				  struct io_comp_state *cs)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 	int cflags = 0;
@@ -2031,7 +2032,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res)
 		req_set_fail_links(req);
 	if (req->flags & REQ_F_BUFFER_SELECTED)
 		cflags = io_put_kbuf(req);
-	io_cqring_add_event(req, res, cflags);
+	__io_req_complete(req, res, cflags, cs);
 }
 
 static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
@@ -2141,14 +2142,18 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
 	return false;
 }
 
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+			     struct io_comp_state *cs)
+{
+	if (!io_rw_reissue(req, res))
+		io_complete_rw_common(&req->rw.kiocb, res, cs);
+}
+
 static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
-	if (!io_rw_reissue(req, res)) {
-		io_complete_rw_common(kiocb, res);
-		io_put_req(req);
-	}
+	__io_complete_rw(req, res, res2, NULL);
 }
 
 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2)
@@ -2382,14 +2387,15 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret)
 	}
 }
 
-static void kiocb_done(struct kiocb *kiocb, ssize_t ret)
+static void kiocb_done(struct kiocb *kiocb, ssize_t ret,
+		       struct io_comp_state *cs)
 {
 	struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
 
 	if (req->flags & REQ_F_CUR_POS)
 		req->file->f_pos = kiocb->ki_pos;
 	if (ret >= 0 && kiocb->ki_complete == io_complete_rw)
-		io_complete_rw(kiocb, ret, 0);
+		__io_complete_rw(req, ret, 0, cs);
 	else
 		io_rw_done(kiocb, ret);
 }
@@ -2925,7 +2931,8 @@ static int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter)
 	return loop_rw_iter(READ, req->file, &req->rw.kiocb, iter);
 }
 
-static int io_read(struct io_kiocb *req, bool force_nonblock)
+static int io_read(struct io_kiocb *req, bool force_nonblock,
+		   struct io_comp_state *cs)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
@@ -2960,7 +2967,7 @@ static int io_read(struct io_kiocb *req, bool force_nonblock)
 
 		/* Catch -EAGAIN return for forced non-blocking submission */
 		if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
-			kiocb_done(kiocb, ret2);
+			kiocb_done(kiocb, ret2, cs);
 		} else {
 			iter.count = iov_count;
 			iter.nr_segs = nr_segs;
@@ -2975,7 +2982,7 @@ copy_iov:
 				if (ret2 == -EIOCBQUEUED) {
 					goto out_free;
 				} else if (ret2 != -EAGAIN) {
-					kiocb_done(kiocb, ret2);
+					kiocb_done(kiocb, ret2, cs);
 					goto out_free;
 				}
 			}
@@ -3021,7 +3028,8 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
-static int io_write(struct io_kiocb *req, bool force_nonblock)
+static int io_write(struct io_kiocb *req, bool force_nonblock,
+		    struct io_comp_state *cs)
 {
 	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
 	struct kiocb *kiocb = &req->rw.kiocb;
@@ -3090,7 +3098,7 @@ static int io_write(struct io_kiocb *req, bool force_nonblock)
 		if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
 			ret2 = -EAGAIN;
 		if (!force_nonblock || ret2 != -EAGAIN) {
-			kiocb_done(kiocb, ret2);
+			kiocb_done(kiocb, ret2, cs);
 		} else {
 			iter.count = iov_count;
 			iter.nr_segs = nr_segs;
@@ -5416,7 +5424,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (ret < 0)
 				break;
 		}
-		ret = io_read(req, force_nonblock);
+		ret = io_read(req, force_nonblock, cs);
 		break;
 	case IORING_OP_WRITEV:
 	case IORING_OP_WRITE_FIXED:
@@ -5426,7 +5434,7 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (ret < 0)
 				break;
 		}
-		ret = io_write(req, force_nonblock);
+		ret = io_write(req, force_nonblock, cs);
 		break;
 	case IORING_OP_FSYNC:
 		if (sqe) {

From c40f63790ec957e9449056fb78d8c2523eff96b5 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 25 Jun 2020 15:39:59 -0600
Subject: [PATCH 051/502] io_uring: use task_work for links if possible

Currently links are always done in an async fashion, unless we catch them
inline after we successfully complete a request without having to resort
to blocking. This isn't necessarily the most efficient approach, it'd be
more ideal if we could just use the task_work handling for this.

Outside of saving an async jump, we can also do less prep work for these
kinds of requests.

Running dependent links from the task_work handler yields some nice
performance benefits. As an example, examples/link-cp from the liburing
repository uses read+write links to implement a copy operation. Without
this patch, a cache cold 4G file read from a VM runs in about 3
seconds:

$ time examples/link-cp /data/file /dev/null

real	0m2.986s
user	0m0.051s
sys	0m2.843s

and a subsequent cache hot run looks like this:

$ time examples/link-cp /data/file /dev/null

real	0m0.898s
user	0m0.069s
sys	0m0.797s

With this patch in place, the cold case takes about 2.4 seconds:

$ time examples/link-cp /data/file /dev/null

real	0m2.400s
user	0m0.020s
sys	0m2.366s

and the cache hot case looks like this:

$ time examples/link-cp /data/file /dev/null

real	0m0.676s
user	0m0.010s
sys	0m0.665s

As expected, the (mostly) cache hot case yields the biggest improvement,
running about 25% faster with this change, while the cache cold case
yields about a 20% increase in performance. Outside of the performance
increase, we're using less CPU as well, as we're not using the async
offload threads at all for this anymore.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 191 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 117 insertions(+), 74 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0bba12e4e559..b628e4429b75 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -898,6 +898,7 @@ enum io_mem_account {
 static void io_wq_submit_work(struct io_wq_work **workptr);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
+static void io_double_put_req(struct io_kiocb *req);
 static void __io_double_put_req(struct io_kiocb *req);
 static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req);
 static void io_queue_linked_timeout(struct io_kiocb *req);
@@ -951,6 +952,41 @@ static void __io_put_req_task(struct io_kiocb *req)
 		put_task_struct(req->task);
 }
 
+static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+{
+	struct mm_struct *mm = current->mm;
+
+	if (mm) {
+		kthread_unuse_mm(mm);
+		mmput(mm);
+	}
+}
+
+static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
+{
+	if (!current->mm) {
+		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+			return -EFAULT;
+		kthread_use_mm(ctx->sqo_mm);
+	}
+
+	return 0;
+}
+
+static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
+				   struct io_kiocb *req)
+{
+	if (!io_op_defs[req->opcode].needs_mm)
+		return 0;
+	return __io_sq_thread_acquire_mm(ctx);
+}
+
+static inline void req_set_fail_links(struct io_kiocb *req)
+{
+	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
+		req->flags |= REQ_F_FAIL_LINK;
+}
+
 static void io_file_put_work(struct work_struct *work);
 
 /*
@@ -1664,6 +1700,64 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 	}
 }
 
+static void __io_req_task_cancel(struct io_kiocb *req, int error)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	spin_lock_irq(&ctx->completion_lock);
+	io_cqring_fill_event(req, error);
+	io_commit_cqring(ctx);
+	spin_unlock_irq(&ctx->completion_lock);
+
+	io_cqring_ev_posted(ctx);
+	req_set_fail_links(req);
+	io_double_put_req(req);
+}
+
+static void io_req_task_cancel(struct callback_head *cb)
+{
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+	__io_req_task_cancel(req, -ECANCELED);
+}
+
+static void __io_req_task_submit(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	__set_current_state(TASK_RUNNING);
+	if (!__io_sq_thread_acquire_mm(ctx)) {
+		mutex_lock(&ctx->uring_lock);
+		__io_queue_sqe(req, NULL, NULL);
+		mutex_unlock(&ctx->uring_lock);
+	} else {
+		__io_req_task_cancel(req, -EFAULT);
+	}
+}
+
+static void io_req_task_submit(struct callback_head *cb)
+{
+	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
+
+	__io_req_task_submit(req);
+}
+
+static void io_req_task_queue(struct io_kiocb *req)
+{
+	struct task_struct *tsk = req->task;
+	int ret;
+
+	init_task_work(&req->task_work, io_req_task_submit);
+
+	ret = task_work_add(tsk, &req->task_work, true);
+	if (unlikely(ret)) {
+		init_task_work(&req->task_work, io_req_task_cancel);
+		tsk = io_wq_get_task(req->ctx->io_wq);
+		task_work_add(tsk, &req->task_work, true);
+	}
+	wake_up_process(tsk);
+}
+
 static void io_free_req(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt = NULL;
@@ -1671,8 +1765,12 @@ static void io_free_req(struct io_kiocb *req)
 	io_req_find_next(req, &nxt);
 	__io_free_req(req);
 
-	if (nxt)
-		io_queue_async_work(nxt);
+	if (nxt) {
+		if (nxt->flags & REQ_F_WORK_INITIALIZED)
+			io_queue_async_work(nxt);
+		else
+			io_req_task_queue(nxt);
+	}
 }
 
 static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
@@ -2013,12 +2111,6 @@ static void kiocb_end_write(struct io_kiocb *req)
 	file_end_write(req->file);
 }
 
-static inline void req_set_fail_links(struct io_kiocb *req)
-{
-	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
-		req->flags |= REQ_F_FAIL_LINK;
-}
-
 static void io_complete_rw_common(struct kiocb *kiocb, long res,
 				  struct io_comp_state *cs)
 {
@@ -2035,35 +2127,6 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
 	__io_req_complete(req, res, cflags, cs);
 }
 
-static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
-{
-	struct mm_struct *mm = current->mm;
-
-	if (mm) {
-		kthread_unuse_mm(mm);
-		mmput(mm);
-	}
-}
-
-static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
-{
-	if (!current->mm) {
-		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
-			return -EFAULT;
-		kthread_use_mm(ctx->sqo_mm);
-	}
-
-	return 0;
-}
-
-static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx,
-				   struct io_kiocb *req)
-{
-	if (!io_op_defs[req->opcode].needs_mm)
-		return 0;
-	return __io_sq_thread_acquire_mm(ctx);
-}
-
 #ifdef CONFIG_BLOCK
 static bool io_resubmit_prep(struct io_kiocb *req, int error)
 {
@@ -2811,20 +2874,6 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
-static void __io_async_buf_error(struct io_kiocb *req, int error)
-{
-	struct io_ring_ctx *ctx = req->ctx;
-
-	spin_lock_irq(&ctx->completion_lock);
-	io_cqring_fill_event(req, error);
-	io_commit_cqring(ctx);
-	spin_unlock_irq(&ctx->completion_lock);
-
-	io_cqring_ev_posted(ctx);
-	req_set_fail_links(req);
-	io_double_put_req(req);
-}
-
 static void io_async_buf_cancel(struct callback_head *cb)
 {
 	struct io_async_rw *rw;
@@ -2832,27 +2881,18 @@ static void io_async_buf_cancel(struct callback_head *cb)
 
 	rw = container_of(cb, struct io_async_rw, task_work);
 	req = rw->wpq.wait.private;
-	__io_async_buf_error(req, -ECANCELED);
+	__io_req_task_cancel(req, -ECANCELED);
 }
 
 static void io_async_buf_retry(struct callback_head *cb)
 {
 	struct io_async_rw *rw;
-	struct io_ring_ctx *ctx;
 	struct io_kiocb *req;
 
 	rw = container_of(cb, struct io_async_rw, task_work);
 	req = rw->wpq.wait.private;
-	ctx = req->ctx;
 
-	__set_current_state(TASK_RUNNING);
-	if (!__io_sq_thread_acquire_mm(ctx)) {
-		mutex_lock(&ctx->uring_lock);
-		__io_queue_sqe(req, NULL, NULL);
-		mutex_unlock(&ctx->uring_lock);
-	} else {
-		__io_async_buf_error(req, -EFAULT);
-	}
+	__io_req_task_submit(req);
 }
 
 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
@@ -5218,23 +5258,25 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock,
 }
 
 static int io_req_defer_prep(struct io_kiocb *req,
-			     const struct io_uring_sqe *sqe)
+			     const struct io_uring_sqe *sqe, bool for_async)
 {
 	ssize_t ret = 0;
 
 	if (!sqe)
 		return 0;
 
-	io_req_init_async(req);
+	if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) {
+		io_req_init_async(req);
 
-	if (io_op_defs[req->opcode].file_table) {
-		ret = io_grab_files(req);
-		if (unlikely(ret))
-			return ret;
+		if (io_op_defs[req->opcode].file_table) {
+			ret = io_grab_files(req);
+			if (unlikely(ret))
+				return ret;
+		}
+
+		io_req_work_grab_env(req, &io_op_defs[req->opcode]);
 	}
 
-	io_req_work_grab_env(req, &io_op_defs[req->opcode]);
-
 	switch (req->opcode) {
 	case IORING_OP_NOP:
 		break;
@@ -5347,7 +5389,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (!req->io) {
 		if (io_alloc_async_ctx(req))
 			return -EAGAIN;
-		ret = io_req_defer_prep(req, sqe);
+		ret = io_req_defer_prep(req, sqe, true);
 		if (ret < 0)
 			return ret;
 	}
@@ -5966,7 +6008,7 @@ fail_req:
 			ret = -EAGAIN;
 			if (io_alloc_async_ctx(req))
 				goto fail_req;
-			ret = io_req_defer_prep(req, sqe);
+			ret = io_req_defer_prep(req, sqe, true);
 			if (unlikely(ret < 0))
 				goto fail_req;
 		}
@@ -6022,13 +6064,14 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		if (io_alloc_async_ctx(req))
 			return -EAGAIN;
 
-		ret = io_req_defer_prep(req, sqe);
+		ret = io_req_defer_prep(req, sqe, false);
 		if (ret) {
 			/* fail even hard links since we don't submit */
 			head->flags |= REQ_F_FAIL_LINK;
 			return ret;
 		}
 		trace_io_uring_link(ctx, req, head);
+		io_get_req_task(req);
 		list_add_tail(&req->link_list, &head->link_list);
 
 		/* last request of a link, enqueue the link */
@@ -6048,7 +6091,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (io_alloc_async_ctx(req))
 				return -EAGAIN;
 
-			ret = io_req_defer_prep(req, sqe);
+			ret = io_req_defer_prep(req, sqe, true);
 			if (ret)
 				req->flags |= REQ_F_FAIL_LINK;
 			*link = req;

From e883a79d8ced8e123f8c4042a29a7524c39935ab Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 25 Jun 2020 18:20:53 +0300
Subject: [PATCH 052/502] io-wq: compact io-wq flags numbers

Renumber the IO_WQ flags so that they occupy adjacent bits.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/io-wq.h b/fs/io-wq.h
index 071f1a997800..04239dfb12b0 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -5,10 +5,10 @@ struct io_wq;
 
 enum {
 	IO_WQ_WORK_CANCEL	= 1,
-	IO_WQ_WORK_HASHED	= 4,
-	IO_WQ_WORK_UNBOUND	= 32,
-	IO_WQ_WORK_NO_CANCEL	= 256,
-	IO_WQ_WORK_CONCURRENT	= 512,
+	IO_WQ_WORK_HASHED	= 2,
+	IO_WQ_WORK_UNBOUND	= 4,
+	IO_WQ_WORK_NO_CANCEL	= 8,
+	IO_WQ_WORK_CONCURRENT	= 16,
 
 	IO_WQ_HASH_SHIFT	= 24,	/* upper 8 bits are used for hash key */
 };

From f4db7182e0de981a3f1b356e0cf43c6815423055 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 25 Jun 2020 18:20:54 +0300
Subject: [PATCH 053/502] io-wq: return next work from ->do_work() directly

It's easier to return the next work from ->do_work() directly than to
pass it back through an in-out argument. It looks nicer and is easier to
compile. Also, merge io_wq_assign_next() into its only user.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    |  8 +++-----
 fs/io-wq.h    |  2 +-
 fs/io_uring.c | 57 +++++++++++++++++++++------------------------------
 3 files changed, 27 insertions(+), 40 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 47c5f3aeb460..72f759e1d6eb 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -523,9 +523,8 @@ get_next:
 				work->flags |= IO_WQ_WORK_CANCEL;
 
 			hash = io_get_work_hash(work);
-			linked = old_work = work;
-			wq->do_work(&linked);
-			linked = (old_work == linked) ? NULL : linked;
+			old_work = work;
+			linked = wq->do_work(work);
 
 			work = next_hashed;
 			if (!work && linked && !io_wq_is_hashed(linked)) {
@@ -781,8 +780,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
 		struct io_wq_work *old_work = work;
 
 		work->flags |= IO_WQ_WORK_CANCEL;
-		wq->do_work(&work);
-		work = (work == old_work) ? NULL : work;
+		work = wq->do_work(work);
 		wq->free_work(old_work);
 	} while (work);
 }
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 04239dfb12b0..114f12ec2d65 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -101,7 +101,7 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 }
 
 typedef void (free_work_fn)(struct io_wq_work *);
-typedef void (io_wq_work_fn)(struct io_wq_work **);
+typedef struct io_wq_work *(io_wq_work_fn)(struct io_wq_work *);
 
 struct io_wq_data {
 	struct user_struct *user;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index b628e4429b75..2e44b3788265 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -895,7 +895,6 @@ enum io_mem_account {
 	ACCT_PINNED,
 };
 
-static void io_wq_submit_work(struct io_wq_work **workptr);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
 static void io_double_put_req(struct io_kiocb *req);
@@ -1773,20 +1772,6 @@ static void io_free_req(struct io_kiocb *req)
 	}
 }
 
-static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt)
-{
-	struct io_kiocb *link;
-	const struct io_op_def *def = &io_op_defs[nxt->opcode];
-
-	if ((nxt->flags & REQ_F_ISREG) && def->hash_reg_file)
-		io_wq_hash_work(&nxt->work, file_inode(nxt->file));
-
-	*workptr = &nxt->work;
-	link = io_prep_linked_timeout(nxt);
-	if (link)
-		nxt->flags |= REQ_F_QUEUE_TIMEOUT;
-}
-
 /*
  * Drop reference to request, return next in chain (if there is one) if this
  * was the last reference to this request.
@@ -1806,24 +1791,29 @@ static void io_put_req(struct io_kiocb *req)
 		io_free_req(req);
 }
 
-static void io_steal_work(struct io_kiocb *req,
-			  struct io_wq_work **workptr)
+static struct io_wq_work *io_steal_work(struct io_kiocb *req)
 {
-	/*
-	 * It's in an io-wq worker, so there always should be at least
-	 * one reference, which will be dropped in io_put_work() just
-	 * after the current handler returns.
-	 *
-	 * It also means, that if the counter dropped to 1, then there is
-	 * no asynchronous users left, so it's safe to steal the next work.
-	 */
-	if (refcount_read(&req->refs) == 1) {
-		struct io_kiocb *nxt = NULL;
+	struct io_kiocb *link, *nxt = NULL;
 
-		io_req_find_next(req, &nxt);
-		if (nxt)
-			io_wq_assign_next(workptr, nxt);
-	}
+	/*
+	 * A ref is owned by io-wq in which context we're. So, if that's the
+	 * last one, it's safe to steal next work. False negatives are Ok,
+	 * it just will be re-punted async in io_put_work()
+	 */
+	if (refcount_read(&req->refs) != 1)
+		return NULL;
+
+	io_req_find_next(req, &nxt);
+	if (!nxt)
+		return NULL;
+
+	if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file)
+		io_wq_hash_work(&nxt->work, file_inode(nxt->file));
+
+	link = io_prep_linked_timeout(nxt);
+	if (link)
+		nxt->flags |= REQ_F_QUEUE_TIMEOUT;
+	return &nxt->work;
 }
 
 /*
@@ -5718,9 +5708,8 @@ static void io_arm_async_linked_timeout(struct io_kiocb *req)
 	io_queue_linked_timeout(link);
 }
 
-static void io_wq_submit_work(struct io_wq_work **workptr)
+static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 {
-	struct io_wq_work *work = *workptr;
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
 	int ret = 0;
 
@@ -5751,7 +5740,7 @@ static void io_wq_submit_work(struct io_wq_work **workptr)
 		io_req_complete(req, ret);
 	}
 
-	io_steal_work(req, workptr);
+	return io_steal_work(req);
 }
 
 static inline struct file *io_file_from_index(struct io_ring_ctx *ctx,

From 1e16c2f917a59d27fb6b540c44d66978c8ad29ef Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Fri, 26 Jun 2020 16:32:50 -0700
Subject: [PATCH 054/502] io_uring: fix function args for !CONFIG_NET
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix build errors when CONFIG_NET is not set/enabled:

../fs/io_uring.c:5472:10: error: too many arguments to function ‘io_sendmsg’
../fs/io_uring.c:5474:10: error: too many arguments to function ‘io_send’
../fs/io_uring.c:5484:10: error: too many arguments to function ‘io_recvmsg’
../fs/io_uring.c:5486:10: error: too many arguments to function ‘io_recv’
../fs/io_uring.c:5510:9: error: too many arguments to function ‘io_accept’
../fs/io_uring.c:5518:9: error: too many arguments to function ‘io_connect’

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: io-uring@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index af4d7a5c49f4..43ddda2a3d49 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4360,12 +4360,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EOPNOTSUPP;
 }
 
-static int io_sendmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
+		      struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
 }
 
-static int io_send(struct io_kiocb *req, bool force_nonblock)
+static int io_send(struct io_kiocb *req, bool force_nonblock,
+		   struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
 }
@@ -4376,12 +4378,14 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	return -EOPNOTSUPP;
 }
 
-static int io_recvmsg(struct io_kiocb *req, bool force_nonblock)
+static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
+		      struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
 }
 
-static int io_recv(struct io_kiocb *req, bool force_nonblock)
+static int io_recv(struct io_kiocb *req, bool force_nonblock,
+		   struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
 }
@@ -4391,7 +4395,8 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EOPNOTSUPP;
 }
 
-static int io_accept(struct io_kiocb *req, bool force_nonblock)
+static int io_accept(struct io_kiocb *req, bool force_nonblock,
+		     struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
 }
@@ -4401,7 +4406,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EOPNOTSUPP;
 }
 
-static int io_connect(struct io_kiocb *req, bool force_nonblock)
+static int io_connect(struct io_kiocb *req, bool force_nonblock,
+		      struct io_comp_state *cs)
 {
 	return -EOPNOTSUPP;
 }

From 8ef77766ba8694968ed4ba24311b4bacee14f235 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 27 Jun 2020 14:04:59 +0300
Subject: [PATCH 055/502] io_uring: fix req->work corruption

req->work and req->task_work are in a union, so io_req_task_queue() screws
everything that was in work. De-union them for now.

[  704.367253] BUG: unable to handle page fault for address:
	ffffffffaf7330d0
[  704.367256] #PF: supervisor write access in kernel mode
[  704.367256] #PF: error_code(0x0003) - permissions violation
[  704.367261] CPU: 6 PID: 1654 Comm: io_wqe_worker-0 Tainted: G
I       5.8.0-rc2-00038-ge28d0bdc4863-dirty #498
[  704.367265] RIP: 0010:_raw_spin_lock+0x1e/0x36
...
[  704.367276]  __alloc_fd+0x35/0x150
[  704.367279]  __get_unused_fd_flags+0x25/0x30
[  704.367280]  io_openat2+0xcb/0x1b0
[  704.367283]  io_issue_sqe+0x36a/0x1320
[  704.367294]  io_wq_submit_work+0x58/0x160
[  704.367295]  io_worker_handle_work+0x2a3/0x430
[  704.367296]  io_wqe_worker+0x2a0/0x350
[  704.367301]  kthread+0x136/0x180
[  704.367304]  ret_from_fork+0x22/0x30

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 43ddda2a3d49..dcf3ffb5ecf3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -668,12 +668,12 @@ struct io_kiocb {
 		 * restore the work, if needed.
 		 */
 		struct {
-			struct callback_head	task_work;
 			struct hlist_node	hash_node;
 			struct async_poll	*apoll;
 		};
 		struct io_wq_work	work;
 	};
+	struct callback_head	task_work;
 };
 
 #define IO_IOPOLL_BATCH			8

From 906a8c3fdbc367325d4200e39212a2a7715b7b0e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 27 Jun 2020 14:04:55 +0300
Subject: [PATCH 056/502] io_uring: fix punting req w/o grabbed env

It's not enough to check for REQ_F_WORK_INITIALIZED and punt async
assuming that io_req_work_grab_env() was called, it may not have been.
E.g. io_close_prep() and personality path set the flag without further
async init.

As a quick fix, always pass next work through io_req_task_queue().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index dcf3ffb5ecf3..483457f6a7df 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1766,12 +1766,8 @@ static void io_free_req(struct io_kiocb *req)
 	io_req_find_next(req, &nxt);
 	__io_free_req(req);
 
-	if (nxt) {
-		if (nxt->flags & REQ_F_WORK_INITIALIZED)
-			io_queue_async_work(nxt);
-		else
-			io_req_task_queue(nxt);
-	}
+	if (nxt)
+		io_req_task_queue(nxt);
 }
 
 /*

From 1bcb8c5d65a845e0ecb9e82237c399b29b8d15ea Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 27 Jun 2020 14:04:56 +0300
Subject: [PATCH 057/502] io_uring: fix feeding io-wq with uninit reqs

io_steal_work() can't be sure that @nxt has req->work properly set, so we
can't pass it to io-wq as is.

A dirty quick fix -- drag it through io_req_task_queue(), and always
return NULL from io_steal_work().

e.g.

[   50.770161] BUG: kernel NULL pointer dereference, address: 00000000
[   50.770164] #PF: supervisor write access in kernel mode
[   50.770164] #PF: error_code(0x0002) - not-present page
[   50.770168] CPU: 1 PID: 1448 Comm: io_wqe_worker-0 Tainted: G
	I       5.8.0-rc2-00035-g2237d76530eb-dirty #494
[   50.770172] RIP: 0010:override_creds+0x19/0x30
...
[   50.770183]  io_worker_handle_work+0x25c/0x430
[   50.770185]  io_wqe_worker+0x2a0/0x350
[   50.770190]  kthread+0x136/0x180
[   50.770194]  ret_from_fork+0x22/0x30

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 483457f6a7df..658949bed77f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1791,7 +1791,7 @@ static void io_put_req(struct io_kiocb *req)
 
 static struct io_wq_work *io_steal_work(struct io_kiocb *req)
 {
-	struct io_kiocb *link, *nxt = NULL;
+	struct io_kiocb *nxt = NULL;
 
 	/*
 	 * A ref is owned by io-wq in which context we're. So, if that's the
@@ -1808,10 +1808,15 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req)
 	if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file)
 		io_wq_hash_work(&nxt->work, file_inode(nxt->file));
 
-	link = io_prep_linked_timeout(nxt);
-	if (link)
-		nxt->flags |= REQ_F_QUEUE_TIMEOUT;
-	return &nxt->work;
+	io_req_task_queue(nxt);
+	/*
+	 * If we're going to return actual work, here should be timeout prep:
+	 *
+	 * link = io_prep_linked_timeout(nxt);
+	 * if (link)
+	 *	nxt->flags |= REQ_F_QUEUE_TIMEOUT;
+	 */
+	return NULL;
 }
 
 /*

From a6d45dd0d43e6d1275e002704540688b6768bc22 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 27 Jun 2020 14:04:57 +0300
Subject: [PATCH 058/502] io_uring: don't mark link's head for_async

There is no reason to mark the head of a link as for-async in
io_req_defer_prep(). grab_env(), etc. will be done later during
submission if necessary.

Pass for_async=false, saving an extra grab_env() in many cases.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 658949bed77f..545b137c7b4a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6092,7 +6092,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (io_alloc_async_ctx(req))
 				return -EAGAIN;
 
-			ret = io_req_defer_prep(req, sqe, true);
+			ret = io_req_defer_prep(req, sqe, false);
 			if (ret)
 				req->flags |= REQ_F_FAIL_LINK;
 			*link = req;

From 710c2bfb66474a186b0196e3342d43db0e6c04e1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 27 Jun 2020 14:04:58 +0300
Subject: [PATCH 059/502] io_uring: fix missing io_grab_files()

We won't have valid ring_fd, ring_file in task work. Grab files early.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 545b137c7b4a..4a9929c0b4ad 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5270,15 +5270,15 @@ static int io_req_defer_prep(struct io_kiocb *req,
 	if (!sqe)
 		return 0;
 
+	if (io_op_defs[req->opcode].file_table) {
+		io_req_init_async(req);
+		ret = io_grab_files(req);
+		if (unlikely(ret))
+			return ret;
+	}
+
 	if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) {
 		io_req_init_async(req);
-
-		if (io_op_defs[req->opcode].file_table) {
-			ret = io_grab_files(req);
-			if (unlikely(ret))
-				return ret;
-		}
-
 		io_req_work_grab_env(req, &io_op_defs[req->opcode]);
 	}
 

From 8c9cb6cd9a46ae6fb7cb6c39cf6a48a53440feef Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 28 Jun 2020 12:52:29 +0300
Subject: [PATCH 060/502] io_uring: fix refs underflow in io_iopoll_queue()

Now that io_complete_rw_common() puts a ref, the extra io_put_req() in
io_iopoll_queue() causes a refcount underflow. Remove it.

[  455.998620] refcount_t: underflow; use-after-free.
[  455.998743] WARNING: CPU: 6 PID: 285394 at lib/refcount.c:28
	refcount_warn_saturate+0xae/0xf0
[  455.998772] CPU: 6 PID: 285394 Comm: read-write2 Tainted: G
          I E     5.8.0-rc2-00048-g1b1aa738f167-dirty #509
[  455.998772] RIP: 0010:refcount_warn_saturate+0xae/0xf0
...
[  455.998778] Call Trace:
[  455.998778]  io_put_req+0x44/0x50
[  455.998778]  io_iopoll_complete+0x245/0x370
[  455.998779]  io_iopoll_getevents+0x12f/0x1a0
[  455.998779]  io_iopoll_reap_events.part.0+0x5e/0xa0
[  455.998780]  io_ring_ctx_wait_and_kill+0x132/0x1c0
[  455.998780]  io_uring_release+0x20/0x30
[  455.998780]  __fput+0xcd/0x230
[  455.998781]  ____fput+0xe/0x10
[  455.998781]  task_work_run+0x67/0xa0
[  455.998781]  do_exit+0x35d/0xb70
[  455.998782]  do_group_exit+0x43/0xa0
[  455.998783]  get_signal+0x140/0x900
[  455.998783]  do_signal+0x37/0x780
[  455.998784]  __prepare_exit_to_usermode+0x126/0x1c0
[  455.998785]  __syscall_return_slowpath+0x3b/0x1c0
[  455.998785]  do_syscall_64+0x5f/0xa0
[  455.998785]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Fixes: a1d7c393c47 ("io_uring: enable READ/WRITE to use deferred completions")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4a9929c0b4ad..ab9f2f3a9b56 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1904,7 +1904,6 @@ static void io_iopoll_queue(struct list_head *again)
 		/* shouldn't happen unless io_uring is dying, cancel reqs */
 		if (unlikely(!current->mm)) {
 			io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL);
-			io_put_req(req);
 			continue;
 		}
 

From e6543a816edca00b6b4c48625d142059d7211059 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 28 Jun 2020 12:52:30 +0300
Subject: [PATCH 061/502] io_uring: remove inflight batching in free_many()

io_free_req_many() is used only for iopoll requests, i.e. reads/writes.
Hence no need to batch inflight unhooking. For safety, it'll be done by
io_dismantle_req(), which replaces __io_req_aux_free(), and looks more
solid and cleaner.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 41 ++++++++---------------------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ab9f2f3a9b56..9863cec8020f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1504,7 +1504,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
 		fput(file);
 }
 
-static void __io_req_aux_free(struct io_kiocb *req)
+static void io_dismantle_req(struct io_kiocb *req)
 {
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		io_cleanup_req(req);
@@ -1514,11 +1514,6 @@ static void __io_req_aux_free(struct io_kiocb *req)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	__io_put_req_task(req);
 	io_req_work_drop_env(req);
-}
-
-static void __io_free_req(struct io_kiocb *req)
-{
-	__io_req_aux_free(req);
 
 	if (req->flags & REQ_F_INFLIGHT) {
 		struct io_ring_ctx *ctx = req->ctx;
@@ -1530,7 +1525,11 @@ static void __io_free_req(struct io_kiocb *req)
 			wake_up(&ctx->inflight_wait);
 		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
 	}
+}
 
+static void __io_free_req(struct io_kiocb *req)
+{
+	io_dismantle_req(req);
 	percpu_ref_put(&req->ctx->refs);
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
@@ -1549,35 +1548,11 @@ static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 	if (!rb->to_free)
 		return;
 	if (rb->need_iter) {
-		int i, inflight = 0;
-		unsigned long flags;
+		int i;
 
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
-
-			if (req->flags & REQ_F_INFLIGHT)
-				inflight++;
-			__io_req_aux_free(req);
-		}
-		if (!inflight)
-			goto do_free;
-
-		spin_lock_irqsave(&ctx->inflight_lock, flags);
-		for (i = 0; i < rb->to_free; i++) {
-			struct io_kiocb *req = rb->reqs[i];
-
-			if (req->flags & REQ_F_INFLIGHT) {
-				list_del(&req->inflight_entry);
-				if (!--inflight)
-					break;
-			}
-		}
-		spin_unlock_irqrestore(&ctx->inflight_lock, flags);
-
-		if (waitqueue_active(&ctx->inflight_wait))
-			wake_up(&ctx->inflight_wait);
+		for (i = 0; i < rb->to_free; i++)
+			io_dismantle_req(rb->reqs[i]);
 	}
-do_free:
 	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
 	percpu_ref_put_many(&ctx->refs, rb->to_free);
 	rb->to_free = rb->need_iter = 0;

From 2757a23e7f6441eabf605ca59eeb88c34071757d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 28 Jun 2020 12:52:31 +0300
Subject: [PATCH 062/502] io_uring: dismantle req early and remove need_iter

Every request in io_req_multi_free() has ->file set. Instead of
pointlessly deferring and counting reqs with a file, dismantle each one
in place and save it for batch dealloc.

It also saves us from potentially skipping io_cleanup_req(), put_task(),
etc. That never happens though, because ->file is always there.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9863cec8020f..8cb5252269d7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1540,22 +1540,16 @@ static void __io_free_req(struct io_kiocb *req)
 struct req_batch {
 	void *reqs[IO_IOPOLL_BATCH];
 	int to_free;
-	int need_iter;
 };
 
 static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
 {
 	if (!rb->to_free)
 		return;
-	if (rb->need_iter) {
-		int i;
 
-		for (i = 0; i < rb->to_free; i++)
-			io_dismantle_req(rb->reqs[i]);
-	}
 	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
 	percpu_ref_put_many(&ctx->refs, rb->to_free);
-	rb->to_free = rb->need_iter = 0;
+	rb->to_free = 0;
 }
 
 static bool io_link_cancel_timeout(struct io_kiocb *req)
@@ -1846,9 +1840,7 @@ static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
 	if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
 		return false;
 
-	if (req->file || req->io)
-		rb->need_iter++;
-
+	io_dismantle_req(req);
 	rb->reqs[rb->to_free++] = req;
 	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
 		io_free_req_many(req->ctx, rb);
@@ -1900,7 +1892,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	/* order with ->result store in io_complete_rw_iopoll() */
 	smp_rmb();
 
-	rb.to_free = rb.need_iter = 0;
+	rb.to_free = 0;
 	while (!list_empty(done)) {
 		int cflags = 0;
 

From c3524383333e4ff2f720ab0c02b3a329f72de78b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 28 Jun 2020 12:52:32 +0300
Subject: [PATCH 063/502] io_uring: batch-free linked requests as well

There is no reason not to batch deallocation of linked requests. Queue
the next request of a link head first, then handle the head like
everything else in io_req_multi_free().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8cb5252269d7..af8d1d64f858 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1728,17 +1728,21 @@ static void io_req_task_queue(struct io_kiocb *req)
 	wake_up_process(tsk);
 }
 
-static void io_free_req(struct io_kiocb *req)
+static void io_queue_next(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt = NULL;
 
 	io_req_find_next(req, &nxt);
-	__io_free_req(req);
-
 	if (nxt)
 		io_req_task_queue(nxt);
 }
 
+static void io_free_req(struct io_kiocb *req)
+{
+	io_queue_next(req);
+	__io_free_req(req);
+}
+
 /*
  * Drop reference to request, return next in chain (if there is one) if this
  * was the last reference to this request.
@@ -1835,16 +1839,19 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
 }
 
-static inline bool io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
+static inline void io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
 {
-	if ((req->flags & REQ_F_LINK_HEAD) || io_is_fallback_req(req))
-		return false;
+	if (unlikely(io_is_fallback_req(req))) {
+		io_free_req(req);
+		return;
+	}
+	if (req->flags & REQ_F_LINK_HEAD)
+		io_queue_next(req);
 
 	io_dismantle_req(req);
 	rb->reqs[rb->to_free++] = req;
 	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
 		io_free_req_many(req->ctx, rb);
-	return true;
 }
 
 static int io_put_kbuf(struct io_kiocb *req)
@@ -1910,9 +1917,8 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		__io_cqring_fill_event(req, req->result, cflags);
 		(*nr_events)++;
 
-		if (refcount_dec_and_test(&req->refs) &&
-		    !io_req_multi_free(&rb, req))
-			io_free_req(req);
+		if (refcount_dec_and_test(&req->refs))
+			io_req_multi_free(&rb, req);
 	}
 
 	io_commit_cqring(ctx);

From 2d6500d44c1374808040d120e625a22b013c9f0d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 28 Jun 2020 12:52:33 +0300
Subject: [PATCH 064/502] io_uring: cosmetic changes for batch free

Move all batch free bits close to each other and rename in a consistent
way.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 69 +++++++++++++++++++++++++++------------------------
 1 file changed, 37 insertions(+), 32 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index af8d1d64f858..18a452ac81cc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1537,21 +1537,6 @@ static void __io_free_req(struct io_kiocb *req)
 		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
 }
 
-struct req_batch {
-	void *reqs[IO_IOPOLL_BATCH];
-	int to_free;
-};
-
-static void io_free_req_many(struct io_ring_ctx *ctx, struct req_batch *rb)
-{
-	if (!rb->to_free)
-		return;
-
-	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
-	percpu_ref_put_many(&ctx->refs, rb->to_free);
-	rb->to_free = 0;
-}
-
 static bool io_link_cancel_timeout(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -1743,6 +1728,41 @@ static void io_free_req(struct io_kiocb *req)
 	__io_free_req(req);
 }
 
+struct req_batch {
+	void *reqs[IO_IOPOLL_BATCH];
+	int to_free;
+};
+
+static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
+				      struct req_batch *rb)
+{
+	kmem_cache_free_bulk(req_cachep, rb->to_free, rb->reqs);
+	percpu_ref_put_many(&ctx->refs, rb->to_free);
+	rb->to_free = 0;
+}
+
+static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
+				     struct req_batch *rb)
+{
+	if (rb->to_free)
+		__io_req_free_batch_flush(ctx, rb);
+}
+
+static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
+{
+	if (unlikely(io_is_fallback_req(req))) {
+		io_free_req(req);
+		return;
+	}
+	if (req->flags & REQ_F_LINK_HEAD)
+		io_queue_next(req);
+
+	io_dismantle_req(req);
+	rb->reqs[rb->to_free++] = req;
+	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
+		__io_req_free_batch_flush(req->ctx, rb);
+}
+
 /*
  * Drop reference to request, return next in chain (if there is one) if this
  * was the last reference to this request.
@@ -1839,21 +1859,6 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
 }
 
-static inline void io_req_multi_free(struct req_batch *rb, struct io_kiocb *req)
-{
-	if (unlikely(io_is_fallback_req(req))) {
-		io_free_req(req);
-		return;
-	}
-	if (req->flags & REQ_F_LINK_HEAD)
-		io_queue_next(req);
-
-	io_dismantle_req(req);
-	rb->reqs[rb->to_free++] = req;
-	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
-		io_free_req_many(req->ctx, rb);
-}
-
 static int io_put_kbuf(struct io_kiocb *req)
 {
 	struct io_buffer *kbuf;
@@ -1918,13 +1923,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		(*nr_events)++;
 
 		if (refcount_dec_and_test(&req->refs))
-			io_req_multi_free(&rb, req);
+			io_req_free_batch(&rb, req);
 	}
 
 	io_commit_cqring(ctx);
 	if (ctx->flags & IORING_SETUP_SQPOLL)
 		io_cqring_ev_posted(ctx);
-	io_free_req_many(ctx, &rb);
+	io_req_free_batch_finish(ctx, &rb);
 
 	if (!list_empty(&again))
 		io_iopoll_queue(&again);

From 9b0d911acce00b67f7e7336f838b732de7d917d6 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 28 Jun 2020 12:52:34 +0300
Subject: [PATCH 065/502] io_uring: kill REQ_F_LINK_NEXT

After pulling nxt from a request, it is no longer a link's head, so
clear REQ_F_LINK_HEAD. The absence of this flag also indicates that
there are no linked requests, so it replaces REQ_F_LINK_NEXT, which can
be killed.

Linked timeouts also behave correctly, leaving the flag intact when
necessary.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 18a452ac81cc..14c5655c0434 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -526,7 +526,6 @@ enum {
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
 
 	REQ_F_LINK_HEAD_BIT,
-	REQ_F_LINK_NEXT_BIT,
 	REQ_F_FAIL_LINK_BIT,
 	REQ_F_INFLIGHT_BIT,
 	REQ_F_CUR_POS_BIT,
@@ -565,8 +564,6 @@ enum {
 
 	/* head of a link */
 	REQ_F_LINK_HEAD		= BIT(REQ_F_LINK_HEAD_BIT),
-	/* already grabbed next link */
-	REQ_F_LINK_NEXT		= BIT(REQ_F_LINK_NEXT_BIT),
 	/* fail rest of links */
 	REQ_F_FAIL_LINK		= BIT(REQ_F_FAIL_LINK_BIT),
 	/* on inflight list */
@@ -1559,10 +1556,6 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	struct io_ring_ctx *ctx = req->ctx;
 	bool wake_ev = false;
 
-	/* Already got next link */
-	if (req->flags & REQ_F_LINK_NEXT)
-		return;
-
 	/*
 	 * The list should never be empty when we are called here. But could
 	 * potentially happen if the chain is messed up, check to be on the
@@ -1587,7 +1580,6 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 		break;
 	}
 
-	req->flags |= REQ_F_LINK_NEXT;
 	if (wake_ev)
 		io_cqring_ev_posted(ctx);
 }
@@ -1628,6 +1620,7 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 {
 	if (likely(!(req->flags & REQ_F_LINK_HEAD)))
 		return;
+	req->flags &= ~REQ_F_LINK_HEAD;
 
 	/*
 	 * If LINK is set, we have dependent requests in this chain. If we

From 6795c5aba247653f99d1f336ff496dd74659b322 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 28 Jun 2020 12:52:35 +0300
Subject: [PATCH 066/502] io_uring: clean up req->result setting by rw

Assign io_size to req->result early in io_{read,write}(); that is enough
and makes the code more straightforward.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 14c5655c0434..f283d111666b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2384,7 +2384,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
-		req->result = 0;
 		req->iopoll_completed = 0;
 	} else {
 		if (kiocb->ki_flags & IOCB_HIPRI)
@@ -2957,10 +2956,8 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	if (!force_nonblock)
 		kiocb->ki_flags &= ~IOCB_NOWAIT;
 
-	req->result = 0;
 	io_size = ret;
-	if (req->flags & REQ_F_LINK_HEAD)
-		req->result = io_size;
+	req->result = io_size;
 
 	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, READ))
@@ -3054,10 +3051,8 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	if (!force_nonblock)
 		req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
 
-	req->result = 0;
 	io_size = ret;
-	if (req->flags & REQ_F_LINK_HEAD)
-		req->result = io_size;
+	req->result = io_size;
 
 	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, WRITE))

From 3adfecaa647ff8afa4b6f5907193cf751a0f8351 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 28 Jun 2020 12:52:37 +0300
Subject: [PATCH 067/502] io_uring: do task_work_run() during iopoll

There are a lot of new users of task_work, and some task_work_add()
calls may happen while we do io polling. Make iopoll call
task_work_run() from time to time, so it doesn't keep polling for
requests that are just sitting there waiting for task_work to run.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f283d111666b..c514a5209703 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2052,6 +2052,8 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		 */
 		if (!(++iters & 7)) {
 			mutex_unlock(&ctx->uring_lock);
+			if (current->task_works)
+				task_work_run();
 			mutex_lock(&ctx->uring_lock);
 		}
 

From f3a6fa2267480d7f19fbde8316372be46055e548 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 28 Jun 2020 12:52:38 +0300
Subject: [PATCH 068/502] io_uring: fix iopoll -EAGAIN handling

req->iopoll() is not necessarily called by the task that submitted the
request. Because of that, it's dangerous to grab_env() and punt async on
-EAGAIN, potentially grabbing another task's mm and corrupting its
memory.

Do resubmit from the submitter task context.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c514a5209703..9d3d8d3866cc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -892,6 +892,7 @@ enum io_mem_account {
 	ACCT_PINNED,
 };
 
+static bool io_rw_reissue(struct io_kiocb *req, long res);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
 static void io_double_put_req(struct io_kiocb *req);
@@ -1873,14 +1874,9 @@ static void io_iopoll_queue(struct list_head *again)
 		req = list_first_entry(again, struct io_kiocb, list);
 		list_del(&req->list);
 
-		/* shouldn't happen unless io_uring is dying, cancel reqs */
-		if (unlikely(!current->mm)) {
+		/* should have ->mm unless io_uring is dying, kill reqs then */
+		if (unlikely(!current->mm) || !io_rw_reissue(req, -EAGAIN))
 			io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL);
-			continue;
-		}
-
-		refcount_inc(&req->refs);
-		io_queue_async_work(req);
 	} while (!list_empty(again));
 }
 
@@ -2387,6 +2383,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		kiocb->ki_flags |= IOCB_HIPRI;
 		kiocb->ki_complete = io_complete_rw_iopoll;
 		req->iopoll_completed = 0;
+		io_get_req_task(req);
 	} else {
 		if (kiocb->ki_flags & IOCB_HIPRI)
 			return -EINVAL;

From fb49278624f75e15d36c3c43d322ca8961fb40e9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 29 Jun 2020 12:59:48 +0300
Subject: [PATCH 069/502] io_uring: fix missing wake_up io_rw_reissue()

Don't forget to wake up a process to which io_rw_reissue() added
task_work.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9d3d8d3866cc..92c7e2a96912 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2168,8 +2168,10 @@ static bool io_rw_reissue(struct io_kiocb *req, long res)
 	tsk = req->task;
 	init_task_work(&req->task_work, io_rw_resubmit);
 	ret = task_work_add(tsk, &req->task_work, true);
-	if (!ret)
+	if (!ret) {
+		wake_up_process(tsk);
 		return true;
+	}
 #endif
 	return false;
 }

From 0188d08a46ffe4a39c6b463451a41d8b503d04d6 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Fri, 12 Jun 2020 13:06:07 +0200
Subject: [PATCH 070/502] s390: convert to msecs_to_jiffies()

Instead of using the old 'jiffies + HZ {/,*} something' calculation
use msecs_to_jiffies() as that makes the code more readable.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/kernel/lgr.c      | 2 +-
 arch/s390/kernel/time.c     | 2 +-
 arch/s390/kernel/topology.c | 4 ++--
 arch/s390/mm/cmm.c          | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c
index 452502f9a0d9..3b895971c3d0 100644
--- a/arch/s390/kernel/lgr.c
+++ b/arch/s390/kernel/lgr.c
@@ -167,7 +167,7 @@ static struct timer_list lgr_timer;
  */
 static void lgr_timer_set(void)
 {
-	mod_timer(&lgr_timer, jiffies + LGR_TIMER_INTERVAL_SECS * HZ);
+	mod_timer(&lgr_timer, jiffies + msecs_to_jiffies(LGR_TIMER_INTERVAL_SECS * MSEC_PER_SEC));
 }
 
 /*
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index b1113b519432..6bc20861fff9 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -669,7 +669,7 @@ static void stp_work_fn(struct work_struct *work)
 		 * There is a usable clock but the synchonization failed.
 		 * Retry after a second.
 		 */
-		mod_timer(&stp_timer, jiffies + HZ);
+		mod_timer(&stp_timer, jiffies + msecs_to_jiffies(MSEC_PER_SEC));
 
 out_unlock:
 	mutex_unlock(&stp_work_mutex);
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 332b542548cd..ca47141a5be9 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -356,9 +356,9 @@ static atomic_t topology_poll = ATOMIC_INIT(0);
 static void set_topology_timer(void)
 {
 	if (atomic_add_unless(&topology_poll, -1, 0))
-		mod_timer(&topology_timer, jiffies + HZ / 10);
+		mod_timer(&topology_timer, jiffies + msecs_to_jiffies(100));
 	else
-		mod_timer(&topology_timer, jiffies + HZ * 60);
+		mod_timer(&topology_timer, jiffies + msecs_to_jiffies(60 * MSEC_PER_SEC));
 }
 
 void topology_expect_change(void)
diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c
index 36bce727897b..5c15ae3daf71 100644
--- a/arch/s390/mm/cmm.c
+++ b/arch/s390/mm/cmm.c
@@ -189,7 +189,7 @@ static void cmm_set_timer(void)
 			del_timer(&cmm_timer);
 		return;
 	}
-	mod_timer(&cmm_timer, jiffies + cmm_timeout_seconds * HZ);
+	mod_timer(&cmm_timer, jiffies + msecs_to_jiffies(cmm_timeout_seconds * MSEC_PER_SEC));
 }
 
 static void cmm_timer_fn(struct timer_list *unused)

From b39e7724b0c28d569e9bd7e95f1b839f64e154bd Mon Sep 17 00:00:00 2001
From: Alexander Egorenkov <egorenar@linux.ibm.com>
Date: Fri, 19 Jun 2020 10:38:46 +0200
Subject: [PATCH 071/502] s390/zcore: remove memmap device

Remove unused /sys/kernel/debug/zcore/memmap device.
Since at least version 1.24.0 of s390-tools zfcpdump no longer
needs it and reads /proc/vmcore instead.

Signed-off-by: Alexander Egorenkov <egorenar@linux.ibm.com>
Reviewed-by: Philipp Rudo <prudo@linux.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 drivers/s390/char/zcore.c | 57 ++-------------------------------------
 1 file changed, 2 insertions(+), 55 deletions(-)

diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index 08f812475f5e..d29f1b71618e 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -1,8 +1,7 @@
 // SPDX-License-Identifier: GPL-1.0+
 /*
  * zcore module to export memory content and register sets for creating system
- * dumps on SCSI disks (zfcpdump). The "zcore/mem" debugfs file shows the same
- * dump format as s390 standalone dumps.
+ * dumps on SCSI disks (zfcpdump).
  *
  * For more information please refer to Documentation/s390/zfcpdump.rst
  *
@@ -16,7 +15,6 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/debugfs.h>
-#include <linux/memblock.h>
 
 #include <asm/asm-offsets.h>
 #include <asm/ipl.h>
@@ -33,8 +31,6 @@
 
 #define TRACE(x...) debug_sprintf_event(zcore_dbf, 1, x)
 
-#define CHUNK_INFO_SIZE	34 /* 2 16-byte char, each followed by blank */
-
 enum arch_id {
 	ARCH_S390	= 0,
 	ARCH_S390X	= 1,
@@ -48,7 +44,6 @@ struct ipib_info {
 static struct debug_info *zcore_dbf;
 static int hsa_available;
 static struct dentry *zcore_dir;
-static struct dentry *zcore_memmap_file;
 static struct dentry *zcore_reipl_file;
 static struct dentry *zcore_hsa_file;
 static struct ipl_parameter_block *zcore_ipl_block;
@@ -139,46 +134,6 @@ static void release_hsa(void)
 	hsa_available = 0;
 }
 
-static ssize_t zcore_memmap_read(struct file *filp, char __user *buf,
-				 size_t count, loff_t *ppos)
-{
-	return simple_read_from_buffer(buf, count, ppos, filp->private_data,
-				       memblock.memory.cnt * CHUNK_INFO_SIZE);
-}
-
-static int zcore_memmap_open(struct inode *inode, struct file *filp)
-{
-	struct memblock_region *reg;
-	char *buf;
-	int i = 0;
-
-	buf = kcalloc(memblock.memory.cnt, CHUNK_INFO_SIZE, GFP_KERNEL);
-	if (!buf) {
-		return -ENOMEM;
-	}
-	for_each_memblock(memory, reg) {
-		sprintf(buf + (i++ * CHUNK_INFO_SIZE), "%016llx %016llx ",
-			(unsigned long long) reg->base,
-			(unsigned long long) reg->size);
-	}
-	filp->private_data = buf;
-	return nonseekable_open(inode, filp);
-}
-
-static int zcore_memmap_release(struct inode *inode, struct file *filp)
-{
-	kfree(filp->private_data);
-	return 0;
-}
-
-static const struct file_operations zcore_memmap_fops = {
-	.owner		= THIS_MODULE,
-	.read		= zcore_memmap_read,
-	.open		= zcore_memmap_open,
-	.release	= zcore_memmap_release,
-	.llseek		= no_llseek,
-};
-
 static ssize_t zcore_reipl_write(struct file *filp, const char __user *buf,
 				 size_t count, loff_t *ppos)
 {
@@ -335,17 +290,11 @@ static int __init zcore_init(void)
 		rc = -ENOMEM;
 		goto fail;
 	}
-	zcore_memmap_file = debugfs_create_file("memmap", S_IRUSR, zcore_dir,
-						NULL, &zcore_memmap_fops);
-	if (!zcore_memmap_file) {
-		rc = -ENOMEM;
-		goto fail_dir;
-	}
 	zcore_reipl_file = debugfs_create_file("reipl", S_IRUSR, zcore_dir,
 						NULL, &zcore_reipl_fops);
 	if (!zcore_reipl_file) {
 		rc = -ENOMEM;
-		goto fail_memmap_file;
+		goto fail_dir;
 	}
 	zcore_hsa_file = debugfs_create_file("hsa", S_IRUSR|S_IWUSR, zcore_dir,
 					     NULL, &zcore_hsa_fops);
@@ -357,8 +306,6 @@ static int __init zcore_init(void)
 
 fail_reipl_file:
 	debugfs_remove(zcore_reipl_file);
-fail_memmap_file:
-	debugfs_remove(zcore_memmap_file);
 fail_dir:
 	debugfs_remove(zcore_dir);
 fail:

From 90ce70f06546e646713d036cfdec39427df296f7 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Tue, 12 May 2020 09:54:58 +0200
Subject: [PATCH 072/502] s390/pci: remove unused functions

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Acked-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/pci_dma.h | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/arch/s390/include/asm/pci_dma.h b/arch/s390/include/asm/pci_dma.h
index 419fac7a62c0..f62cd3ed2d44 100644
--- a/arch/s390/include/asm/pci_dma.h
+++ b/arch/s390/include/asm/pci_dma.h
@@ -131,12 +131,6 @@ static inline void validate_st_entry(unsigned long *entry)
 	*entry |= ZPCI_TABLE_VALID;
 }
 
-static inline void invalidate_table_entry(unsigned long *entry)
-{
-	*entry &= ~ZPCI_TABLE_VALID_MASK;
-	*entry |= ZPCI_TABLE_INVALID;
-}
-
 static inline void invalidate_pt_entry(unsigned long *entry)
 {
 	WARN_ON_ONCE((*entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_INVALID);
@@ -173,11 +167,6 @@ static inline int pt_entry_isvalid(unsigned long entry)
 	return (entry & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID;
 }
 
-static inline int entry_isprotected(unsigned long entry)
-{
-	return (entry & ZPCI_TABLE_PROT_MASK) == ZPCI_TABLE_PROTECTED;
-}
-
 static inline unsigned long *get_rt_sto(unsigned long entry)
 {
 	return ((entry & ZPCI_TABLE_TYPE_MASK) == ZPCI_TABLE_TYPE_RTX)

From 7fa0d6ff35cfaae9cc7012d9220cd24400c650f1 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Tue, 12 May 2020 09:55:18 +0200
Subject: [PATCH 073/502] s390/time: remove unused function

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/timex.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index 6bf3a45ccfec..289aaff4d365 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -49,11 +49,6 @@ static inline void set_clock_comparator(__u64 time)
 	asm volatile("sckc %0" : : "Q" (time));
 }
 
-static inline void store_clock_comparator(__u64 *time)
-{
-	asm volatile("stckc %0" : "=Q" (*time));
-}
-
 void clock_comparator_work(void);
 
 void __init time_early_init(void);

From ecb1ff6833c461ea3bcf16396cd4f1eb50b119c2 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 18 Jun 2020 07:09:57 +0200
Subject: [PATCH 074/502] s390/debug: remove raw view

There is not a single user of the debug raw view. Therefore remove it
before anybody uses it. If anybody made use of the view, it would
expose the struct __debug_entry definition to userspace and would
really make it uapi. This wouldn't be good, since the definition is
suboptimal and needs to be changed.

Right now the structure definition is only defined to be uapi, however
there is no user.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 Documentation/s390/s390dbf.rst | 17 ++++----------
 arch/s390/include/asm/debug.h  |  1 -
 arch/s390/kernel/debug.c       | 42 ----------------------------------
 3 files changed, 4 insertions(+), 56 deletions(-)

diff --git a/Documentation/s390/s390dbf.rst b/Documentation/s390/s390dbf.rst
index cdb36842b898..af8bdc3629e7 100644
--- a/Documentation/s390/s390dbf.rst
+++ b/Documentation/s390/s390dbf.rst
@@ -67,7 +67,7 @@ corresponding component. The debugfs normally should be mounted to
 The content of the directories are files which represent different views
 to the debug log. Each component can decide which views should be
 used through registering them with the function :c:func:`debug_register_view()`.
-Predefined views for hex/ascii, sprintf and raw binary data are provided.
+Predefined views for hex/ascii and sprintf data are provided.
 It is also possible to define other views. The content of
 a view can be inspected simply by reading the corresponding debugfs file.
 
@@ -119,8 +119,6 @@ Predefined views:
 
   extern struct debug_view debug_hex_ascii_view;
 
-  extern struct debug_view debug_raw_view;
-
   extern struct debug_view debug_sprintf_view;
 
 Examples
@@ -129,7 +127,7 @@ Examples
 .. code-block:: c
 
   /*
-   * hex_ascii- + raw-view Example
+   * hex_ascii-view Example
    */
 
   #include <linux/init.h>
@@ -143,7 +141,6 @@ Examples
 
       debug_info = debug_register("test", 1, 4, 4 );
       debug_register_view(debug_info, &debug_hex_ascii_view);
-      debug_register_view(debug_info, &debug_raw_view);
 
       debug_text_event(debug_info, 4 , "one ");
       debug_int_exception(debug_info, 4, 4711);
@@ -201,7 +198,7 @@ debugfs-files:
 Example::
 
   > ls /sys/kernel/debug/s390dbf/dasd
-  flush  hex_ascii  level pages raw
+  flush  hex_ascii  level pages
   > cat /sys/kernel/debug/s390dbf/dasd/hex_ascii | sort -k2,2 -s
   00 00974733272:680099 2 - 02 0006ad7e  07 ea 4a 90 | ....
   00 00974733272:682210 2 - 02 0006ade6  46 52 45 45 | FREE
@@ -298,10 +295,9 @@ order to see the debug entries well formatted.
 Predefined Views
 ----------------
 
-There are three predefined views: hex_ascii, raw and sprintf.
+There are two predefined views: hex_ascii and sprintf.
 The hex_ascii view shows the data field in hex and ascii representation
 (e.g. ``45 43 4b 44 | ECKD``).
-The raw view returns a bytestream as the debug areas are stored in memory.
 
 The sprintf view formats the debug entries in the same way as the sprintf
 function would do. The sprintf event/exception functions write to the
@@ -334,11 +330,6 @@ The format of the hex_ascii and sprintf view is as follows:
 - Return Address to caller
 - data field
 
-The format of the raw view is:
-
-- Header as described in debug.h
-- datafield
-
 A typical line of the hex_ascii view will look like the following (first line
 is only for explanation and will not be displayed when 'cating' the view)::
 
diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h
index 310134015541..d39da8f3130e 100644
--- a/arch/s390/include/asm/debug.h
+++ b/arch/s390/include/asm/debug.h
@@ -82,7 +82,6 @@ struct debug_view {
 };
 
 extern struct debug_view debug_hex_ascii_view;
-extern struct debug_view debug_raw_view;
 extern struct debug_view debug_sprintf_view;
 
 /* do NOT use the _common functions */
diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index 263075a1af36..beb4b44a11d1 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -90,27 +90,11 @@ static int debug_input_flush_fn(debug_info_t *id, struct debug_view *view,
 				size_t user_buf_size, loff_t *offset);
 static int debug_hex_ascii_format_fn(debug_info_t *id, struct debug_view *view,
 				     char *out_buf, const char *in_buf);
-static int debug_raw_format_fn(debug_info_t *id,
-			       struct debug_view *view, char *out_buf,
-			       const char *in_buf);
-static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view,
-			       int area, debug_entry_t *entry, char *out_buf);
-
 static int debug_sprintf_format_fn(debug_info_t *id, struct debug_view *view,
 				   char *out_buf, debug_sprintf_entry_t *curr_event);
 
 /* globals */
 
-struct debug_view debug_raw_view = {
-	"raw",
-	NULL,
-	&debug_raw_header_fn,
-	&debug_raw_format_fn,
-	NULL,
-	NULL
-};
-EXPORT_SYMBOL(debug_raw_view);
-
 struct debug_view debug_hex_ascii_view = {
 	"hex_ascii",
 	NULL,
@@ -1385,32 +1369,6 @@ out:
 	return rc;		/* number of input characters */
 }
 
-/*
- * prints debug header in raw format
- */
-static int debug_raw_header_fn(debug_info_t *id, struct debug_view *view,
-			       int area, debug_entry_t *entry, char *out_buf)
-{
-	int rc;
-
-	rc = sizeof(debug_entry_t);
-	memcpy(out_buf, entry, sizeof(debug_entry_t));
-	return rc;
-}
-
-/*
- * prints debug data in raw format
- */
-static int debug_raw_format_fn(debug_info_t *id, struct debug_view *view,
-			       char *out_buf, const char *in_buf)
-{
-	int rc;
-
-	rc = id->buf_size;
-	memcpy(out_buf, in_buf, id->buf_size);
-	return rc;
-}
-
 /*
  * prints debug data in hex/ascii format
  */

From 6ffb3f6b46d0d02c318946047dc5ce6553495848 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Thu, 18 Jun 2020 07:41:18 +0200
Subject: [PATCH 075/502] s390/debug: remove struct __debug_entry from uapi

There is no interface to userspace which exposes anything that would
require the struct __debug_entry definition. Therefore remove it from
uapi. This allows the definition to be changed, since it is only used
kernel-internally.

The only exception is the crash utility, however that tool must handle
changes all the time anyway.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/debug.h      | 17 ++++++++++++++-
 arch/s390/include/uapi/asm/debug.h | 35 ------------------------------
 2 files changed, 16 insertions(+), 36 deletions(-)
 delete mode 100644 arch/s390/include/uapi/asm/debug.h

diff --git a/arch/s390/include/asm/debug.h b/arch/s390/include/asm/debug.h
index d39da8f3130e..17a26261f288 100644
--- a/arch/s390/include/asm/debug.h
+++ b/arch/s390/include/asm/debug.h
@@ -12,7 +12,7 @@
 #include <linux/kernel.h>
 #include <linux/time.h>
 #include <linux/refcount.h>
-#include <uapi/asm/debug.h>
+#include <linux/fs.h>
 
 #define DEBUG_MAX_LEVEL		   6  /* debug levels range from 0 to 6 */
 #define DEBUG_OFF_LEVEL		   -1 /* level where debug is switched off */
@@ -26,6 +26,21 @@
 #define DEBUG_DATA(entry) (char *)(entry + 1) /* data is stored behind */
 					      /* the entry information */
 
+#define __DEBUG_FEATURE_VERSION	   2  /* version of debug feature */
+
+struct __debug_entry {
+	union {
+		struct {
+			unsigned long clock	: 52;
+			unsigned long exception	:  1;
+			unsigned long level	:  3;
+			unsigned long cpuid	:  8;
+		} fields;
+		unsigned long stck;
+	} id;
+	void *caller;
+} __packed;
+
 typedef struct __debug_entry debug_entry_t;
 
 struct debug_view;
diff --git a/arch/s390/include/uapi/asm/debug.h b/arch/s390/include/uapi/asm/debug.h
deleted file mode 100644
index c7c564d9aea4..000000000000
--- a/arch/s390/include/uapi/asm/debug.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- *   S/390 debug facility
- *
- *    Copyright IBM Corp. 1999, 2000
- */
-
-#ifndef _UAPIDEBUG_H
-#define _UAPIDEBUG_H
-
-#include <linux/fs.h>
-
-/* Note:
- * struct __debug_entry must be defined outside of #ifdef __KERNEL__ 
- * in order to allow a user program to analyze the 'raw'-view.
- */
-
-struct __debug_entry{
-        union {
-                struct {
-                        unsigned long long clock:52;
-                        unsigned long long exception:1;
-                        unsigned long long level:3;
-                        unsigned long long cpuid:8;
-                } fields;
-
-                unsigned long long stck;
-        } id;
-        void* caller;
-} __attribute__((packed));
-
-
-#define __DEBUG_FEATURE_VERSION      2  /* version of debug feature */
-
-#endif /* _UAPIDEBUG_H */

From 28ccce5f50af2e9484d6b74b22ff9eb54bb775a2 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Wed, 17 Jun 2020 16:29:30 -0500
Subject: [PATCH 076/502] s390/appldata: use struct_size() helper

Make use of the struct_size() helper instead of an open-coded version
in order to avoid any potential type mistakes.

This code was detected with the help of Coccinelle, and audited and
fixed manually.

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Message-Id: <20200617212930.GA11728@embeddedor>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/appldata/appldata_os.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/s390/appldata/appldata_os.c b/arch/s390/appldata/appldata_os.c
index 5503217366ec..a363d30ce739 100644
--- a/arch/s390/appldata/appldata_os.c
+++ b/arch/s390/appldata/appldata_os.c
@@ -129,8 +129,7 @@ static void appldata_get_os_data(void *data)
 
 	os_data->nr_cpus = j;
 
-	new_size = sizeof(struct appldata_os_data) +
-		   (os_data->nr_cpus * sizeof(struct appldata_os_per_cpu));
+	new_size = struct_size(os_data, os_cpu, os_data->nr_cpus);
 	if (ops.size != new_size) {
 		if (ops.active) {
 			rc = appldata_diag(APPLDATA_RECORD_OS_ID,
@@ -165,8 +164,7 @@ static int __init appldata_os_init(void)
 {
 	int rc, max_size;
 
-	max_size = sizeof(struct appldata_os_data) +
-		   (num_possible_cpus() * sizeof(struct appldata_os_per_cpu));
+	max_size = struct_size(appldata_os_data, os_cpu, num_possible_cpus());
 	if (max_size > APPLDATA_MAX_REC_SIZE) {
 		pr_err("Maximum OS record size %i exceeds the maximum "
 		       "record size %i\n", max_size, APPLDATA_MAX_REC_SIZE);

From 6b05dfacd761c6ace11def4b3b42fc6a7583fec3 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 21 Apr 2020 19:04:02 +0200
Subject: [PATCH 077/502] docs: RCU: Convert checklist.txt to ReST

- Add a SPDX header;
- Adjust document title;
- Some whitespace fixes and new line breaks;
- Use the right list markups;
- Add it to RCU/index.rst.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../RCU/{checklist.txt => checklist.rst}        | 17 ++++++++++++-----
 Documentation/RCU/index.rst                     |  3 +++
 2 files changed, 15 insertions(+), 5 deletions(-)
 rename Documentation/RCU/{checklist.txt => checklist.rst} (98%)

diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.rst
similarity index 98%
rename from Documentation/RCU/checklist.txt
rename to Documentation/RCU/checklist.rst
index e98ff261a438..2efed9926c3f 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.rst
@@ -1,4 +1,8 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+================================
 Review Checklist for RCU Patches
+================================
 
 
 This document contains a checklist for producing and reviewing patches
@@ -411,18 +415,21 @@ over a rather long period of time, but improvements are always welcome!
 	__rcu sparse checks to validate your RCU code.	These can help
 	find problems as follows:
 
-	CONFIG_PROVE_LOCKING: check that accesses to RCU-protected data
+	CONFIG_PROVE_LOCKING:
+		check that accesses to RCU-protected data
 		structures are carried out under the proper RCU
 		read-side critical section, while holding the right
 		combination of locks, or whatever other conditions
 		are appropriate.
 
-	CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the
+	CONFIG_DEBUG_OBJECTS_RCU_HEAD:
+		check that you don't pass the
 		same object to call_rcu() (or friends) before an RCU
 		grace period has elapsed since the last time that you
 		passed that same object to call_rcu() (or friends).
 
-	__rcu sparse checks: tag the pointer to the RCU-protected data
+	__rcu sparse checks:
+		tag the pointer to the RCU-protected data
 		structure with __rcu, and sparse will warn you if you
 		access that pointer without the services of one of the
 		variants of rcu_dereference().
@@ -442,8 +449,8 @@ over a rather long period of time, but improvements are always welcome!
 
 	You instead need to use one of the barrier functions:
 
-	o	call_rcu() -> rcu_barrier()
-	o	call_srcu() -> srcu_barrier()
+	-	call_rcu() -> rcu_barrier()
+	-	call_srcu() -> srcu_barrier()
 
 	However, these barrier functions are absolutely -not- guaranteed
 	to wait for a grace period.  In fact, if there are no call_rcu()
diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 81a0a1e5f767..c1ba4d130bb0 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -1,3 +1,5 @@
+.. SPDX-License-Identifier: GPL-2.0
+
 .. _rcu_concepts:
 
 ============
@@ -8,6 +10,7 @@ RCU concepts
    :maxdepth: 3
 
    arrayRCU
+   checklist
    rcubarrier
    rcu_dereference
    whatisRCU

From a3b0a79f8903f955250505f99d1e37b6c7d7b060 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 21 Apr 2020 19:04:03 +0200
Subject: [PATCH 078/502] docs: RCU: Convert lockdep-splat.txt to ReST

- Add a SPDX header;
- Add a document title;
- Some whitespace fixes and new line breaks;
- Mark literal blocks as such;
- Add it to RCU/index.rst.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst                   |  1 +
 .../{lockdep-splat.txt => lockdep-splat.rst}  | 99 ++++++++++---------
 2 files changed, 53 insertions(+), 47 deletions(-)
 rename Documentation/RCU/{lockdep-splat.txt => lockdep-splat.rst} (54%)

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index c1ba4d130bb0..430a37132b2c 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -11,6 +11,7 @@ RCU concepts
 
    arrayRCU
    checklist
+   lockdep-splat
    rcubarrier
    rcu_dereference
    whatisRCU
diff --git a/Documentation/RCU/lockdep-splat.txt b/Documentation/RCU/lockdep-splat.rst
similarity index 54%
rename from Documentation/RCU/lockdep-splat.txt
rename to Documentation/RCU/lockdep-splat.rst
index b8096316fd11..2a5c79db57dc 100644
--- a/Documentation/RCU/lockdep-splat.txt
+++ b/Documentation/RCU/lockdep-splat.rst
@@ -1,3 +1,9 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================
+Lockdep-RCU Splat
+=================
+
 Lockdep-RCU was added to the Linux kernel in early 2010
 (http://lwn.net/Articles/371986/).  This facility checks for some common
 misuses of the RCU API, most notably using one of the rcu_dereference()
@@ -12,55 +18,54 @@ overwriting or worse.  There can of course be false positives, this
 being the real world and all that.
 
 So let's look at an example RCU lockdep splat from 3.0-rc5, one that
-has long since been fixed:
+has long since been fixed::
 
-=============================
-WARNING: suspicious RCU usage
------------------------------
-block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage!
+    =============================
+    WARNING: suspicious RCU usage
+    -----------------------------
+    block/cfq-iosched.c:2776 suspicious rcu_dereference_protected() usage!
 
-other info that might help us debug this:
+other info that might help us debug this::
 
+    rcu_scheduler_active = 1, debug_locks = 0
+    3 locks held by scsi_scan_6/1552:
+    #0:  (&shost->scan_mutex){+.+.}, at: [<ffffffff8145efca>]
+    scsi_scan_host_selected+0x5a/0x150
+    #1:  (&eq->sysfs_lock){+.+.}, at: [<ffffffff812a5032>]
+    elevator_exit+0x22/0x60
+    #2:  (&(&q->__queue_lock)->rlock){-.-.}, at: [<ffffffff812b6233>]
+    cfq_exit_queue+0x43/0x190
 
-rcu_scheduler_active = 1, debug_locks = 0
-3 locks held by scsi_scan_6/1552:
- #0:  (&shost->scan_mutex){+.+.}, at: [<ffffffff8145efca>]
-scsi_scan_host_selected+0x5a/0x150
- #1:  (&eq->sysfs_lock){+.+.}, at: [<ffffffff812a5032>]
-elevator_exit+0x22/0x60
- #2:  (&(&q->__queue_lock)->rlock){-.-.}, at: [<ffffffff812b6233>]
-cfq_exit_queue+0x43/0x190
+    stack backtrace:
+    Pid: 1552, comm: scsi_scan_6 Not tainted 3.0.0-rc5 #17
+    Call Trace:
+    [<ffffffff810abb9b>] lockdep_rcu_dereference+0xbb/0xc0
+    [<ffffffff812b6139>] __cfq_exit_single_io_context+0xe9/0x120
+    [<ffffffff812b626c>] cfq_exit_queue+0x7c/0x190
+    [<ffffffff812a5046>] elevator_exit+0x36/0x60
+    [<ffffffff812a802a>] blk_cleanup_queue+0x4a/0x60
+    [<ffffffff8145cc09>] scsi_free_queue+0x9/0x10
+    [<ffffffff81460944>] __scsi_remove_device+0x84/0xd0
+    [<ffffffff8145dca3>] scsi_probe_and_add_lun+0x353/0xb10
+    [<ffffffff817da069>] ? error_exit+0x29/0xb0
+    [<ffffffff817d98ed>] ? _raw_spin_unlock_irqrestore+0x3d/0x80
+    [<ffffffff8145e722>] __scsi_scan_target+0x112/0x680
+    [<ffffffff812c690d>] ? trace_hardirqs_off_thunk+0x3a/0x3c
+    [<ffffffff817da069>] ? error_exit+0x29/0xb0
+    [<ffffffff812bcc60>] ? kobject_del+0x40/0x40
+    [<ffffffff8145ed16>] scsi_scan_channel+0x86/0xb0
+    [<ffffffff8145f0b0>] scsi_scan_host_selected+0x140/0x150
+    [<ffffffff8145f149>] do_scsi_scan_host+0x89/0x90
+    [<ffffffff8145f170>] do_scan_async+0x20/0x160
+    [<ffffffff8145f150>] ? do_scsi_scan_host+0x90/0x90
+    [<ffffffff810975b6>] kthread+0xa6/0xb0
+    [<ffffffff817db154>] kernel_thread_helper+0x4/0x10
+    [<ffffffff81066430>] ? finish_task_switch+0x80/0x110
+    [<ffffffff817d9c04>] ? retint_restore_args+0xe/0xe
+    [<ffffffff81097510>] ? __kthread_init_worker+0x70/0x70
+    [<ffffffff817db150>] ? gs_change+0xb/0xb
 
-stack backtrace:
-Pid: 1552, comm: scsi_scan_6 Not tainted 3.0.0-rc5 #17
-Call Trace:
- [<ffffffff810abb9b>] lockdep_rcu_dereference+0xbb/0xc0
- [<ffffffff812b6139>] __cfq_exit_single_io_context+0xe9/0x120
- [<ffffffff812b626c>] cfq_exit_queue+0x7c/0x190
- [<ffffffff812a5046>] elevator_exit+0x36/0x60
- [<ffffffff812a802a>] blk_cleanup_queue+0x4a/0x60
- [<ffffffff8145cc09>] scsi_free_queue+0x9/0x10
- [<ffffffff81460944>] __scsi_remove_device+0x84/0xd0
- [<ffffffff8145dca3>] scsi_probe_and_add_lun+0x353/0xb10
- [<ffffffff817da069>] ? error_exit+0x29/0xb0
- [<ffffffff817d98ed>] ? _raw_spin_unlock_irqrestore+0x3d/0x80
- [<ffffffff8145e722>] __scsi_scan_target+0x112/0x680
- [<ffffffff812c690d>] ? trace_hardirqs_off_thunk+0x3a/0x3c
- [<ffffffff817da069>] ? error_exit+0x29/0xb0
- [<ffffffff812bcc60>] ? kobject_del+0x40/0x40
- [<ffffffff8145ed16>] scsi_scan_channel+0x86/0xb0
- [<ffffffff8145f0b0>] scsi_scan_host_selected+0x140/0x150
- [<ffffffff8145f149>] do_scsi_scan_host+0x89/0x90
- [<ffffffff8145f170>] do_scan_async+0x20/0x160
- [<ffffffff8145f150>] ? do_scsi_scan_host+0x90/0x90
- [<ffffffff810975b6>] kthread+0xa6/0xb0
- [<ffffffff817db154>] kernel_thread_helper+0x4/0x10
- [<ffffffff81066430>] ? finish_task_switch+0x80/0x110
- [<ffffffff817d9c04>] ? retint_restore_args+0xe/0xe
- [<ffffffff81097510>] ? __kthread_init_worker+0x70/0x70
- [<ffffffff817db150>] ? gs_change+0xb/0xb
-
-Line 2776 of block/cfq-iosched.c in v3.0-rc5 is as follows:
+Line 2776 of block/cfq-iosched.c in v3.0-rc5 is as follows::
 
 	if (rcu_dereference(ioc->ioc_data) == cic) {
 
@@ -70,7 +75,7 @@ case.  Instead, we hold three locks, one of which might be RCU related.
 And maybe that lock really does protect this reference.  If so, the fix
 is to inform RCU, perhaps by changing __cfq_exit_single_io_context() to
 take the struct request_queue "q" from cfq_exit_queue() as an argument,
-which would permit us to invoke rcu_dereference_protected as follows:
+which would permit us to invoke rcu_dereference_protected as follows::
 
 	if (rcu_dereference_protected(ioc->ioc_data,
 				      lockdep_is_held(&q->queue_lock)) == cic) {
@@ -85,7 +90,7 @@ On the other hand, perhaps we really do need an RCU read-side critical
 section.  In this case, the critical section must span the use of the
 return value from rcu_dereference(), or at least until there is some
 reference count incremented or some such.  One way to handle this is to
-add rcu_read_lock() and rcu_read_unlock() as follows:
+add rcu_read_lock() and rcu_read_unlock() as follows::
 
 	rcu_read_lock();
 	if (rcu_dereference(ioc->ioc_data) == cic) {
@@ -102,7 +107,7 @@ above lockdep-RCU splat.
 But in this particular case, we don't actually dereference the pointer
 returned from rcu_dereference().  Instead, that pointer is just compared
 to the cic pointer, which means that the rcu_dereference() can be replaced
-by rcu_access_pointer() as follows:
+by rcu_access_pointer() as follows::
 
 	if (rcu_access_pointer(ioc->ioc_data) == cic) {
 

From 058cc23bcad08aca62987cc795fe406ac39146d0 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 21 Apr 2020 19:04:04 +0200
Subject: [PATCH 079/502] docs: RCU: Convert lockdep.txt to ReST

- Add a SPDX header;
- Adjust document title;
- Mark literal blocks as such;
- Add it to RCU/index.rst.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst                    |  1 +
 Documentation/RCU/{lockdep.txt => lockdep.rst} | 12 ++++++++----
 2 files changed, 9 insertions(+), 4 deletions(-)
 rename Documentation/RCU/{lockdep.txt => lockdep.rst} (96%)

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 430a37132b2c..fa7a2a8949b7 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -11,6 +11,7 @@ RCU concepts
 
    arrayRCU
    checklist
+   lockdep
    lockdep-splat
    rcubarrier
    rcu_dereference
diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.rst
similarity index 96%
rename from Documentation/RCU/lockdep.txt
rename to Documentation/RCU/lockdep.rst
index 89db949eeca0..f1fc8ae3846a 100644
--- a/Documentation/RCU/lockdep.txt
+++ b/Documentation/RCU/lockdep.rst
@@ -1,4 +1,8 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+========================
 RCU and lockdep checking
+========================
 
 All flavors of RCU have lockdep checking available, so that lockdep is
 aware of when each task enters and leaves any flavor of RCU read-side
@@ -8,7 +12,7 @@ tracking to include RCU state, which can sometimes help when debugging
 deadlocks and the like.
 
 In addition, RCU provides the following primitives that check lockdep's
-state:
+state::
 
 	rcu_read_lock_held() for normal RCU.
 	rcu_read_lock_bh_held() for RCU-bh.
@@ -63,7 +67,7 @@ checking of rcu_dereference() primitives:
 The rcu_dereference_check() check expression can be any boolean
 expression, but would normally include a lockdep expression.  However,
 any boolean expression can be used.  For a moderately ornate example,
-consider the following:
+consider the following::
 
 	file = rcu_dereference_check(fdt->fd[fd],
 				     lockdep_is_held(&files->file_lock) ||
@@ -82,7 +86,7 @@ RCU read-side critical sections, in case (2) the ->file_lock prevents
 any change from taking place, and finally, in case (3) the current task
 is the only task accessing the file_struct, again preventing any change
 from taking place.  If the above statement was invoked only from updater
-code, it could instead be written as follows:
+code, it could instead be written as follows::
 
 	file = rcu_dereference_protected(fdt->fd[fd],
 					 lockdep_is_held(&files->file_lock) ||
@@ -105,7 +109,7 @@ false and they are called from outside any RCU read-side critical section.
 
 For example, the workqueue for_each_pwq() macro is intended to be used
 either within an RCU read-side critical section or with wq->mutex held.
-It is thus implemented as follows:
+It is thus implemented as follows::
 
 	#define for_each_pwq(pwq, wq)
 		list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,

From 2cdb54c93a7e5beb6f3f8b63575d9fb664dfc603 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 21 Apr 2020 19:04:05 +0200
Subject: [PATCH 080/502] docs: RCU: Convert rculist_nulls.txt to ReST

- Add a SPDX header;
- Adjust document title;
- Some whitespace fixes and new line breaks;
- Mark literal blocks as such;
- Add it to RCU/index.rst.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst         |   1 +
 Documentation/RCU/rculist_nulls.rst | 194 ++++++++++++++++++++++++++++
 Documentation/RCU/rculist_nulls.txt | 172 ------------------------
 include/linux/rculist_nulls.h       |   2 +-
 net/core/sock.c                     |   4 +-
 5 files changed, 198 insertions(+), 175 deletions(-)
 create mode 100644 Documentation/RCU/rculist_nulls.rst
 delete mode 100644 Documentation/RCU/rculist_nulls.txt

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index fa7a2a8949b7..577a47e27f5d 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -17,6 +17,7 @@ RCU concepts
    rcu_dereference
    whatisRCU
    rcu
+   rculist_nulls
    listRCU
    NMI-RCU
    UP
diff --git a/Documentation/RCU/rculist_nulls.rst b/Documentation/RCU/rculist_nulls.rst
new file mode 100644
index 000000000000..d40374221d69
--- /dev/null
+++ b/Documentation/RCU/rculist_nulls.rst
@@ -0,0 +1,194 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=================================================
+Using RCU hlist_nulls to protect list and objects
+=================================================
+
+This section describes how to use hlist_nulls to
+protect read-mostly linked lists and
+objects using SLAB_TYPESAFE_BY_RCU allocations.
+
+Please read the basics in Documentation/RCU/listRCU.rst
+
+Using special markers (called 'nulls') is a convenient way
+to solve the following problem:
+
+A typical RCU linked list managing objects which are
+allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
+use the following algorithms:
+
+1) Lookup algo
+--------------
+
+::
+
+  rcu_read_lock()
+  begin:
+  obj = lockless_lookup(key);
+  if (obj) {
+    if (!try_get_ref(obj)) // might fail for free objects
+      goto begin;
+    /*
+    * Because a writer could delete the object, and a writer could
+    * reuse the object before the RCU grace period ends, we
+    * must check the key after getting a reference on the object
+    */
+    if (obj->key != key) { // not the object we expected
+      put_ref(obj);
+      goto begin;
+    }
+  }
+  rcu_read_unlock();
+
+Beware that lockless_lookup(key) cannot use the traditional hlist_for_each_entry_rcu(),
+but must instead use a version with an additional memory barrier (smp_rmb())
+
+::
+
+  lockless_lookup(key)
+  {
+    struct hlist_node *node, *next;
+    for (pos = rcu_dereference((head)->first);
+        pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
+        ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
+        pos = rcu_dereference(next))
+      if (obj->key == key)
+        return obj;
+    return NULL;
+  }
+
+And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb()::
+
+  struct hlist_node *node;
+  for (pos = rcu_dereference((head)->first);
+        pos && ({ prefetch(pos->next); 1; }) &&
+        ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
+        pos = rcu_dereference(pos->next))
+   if (obj->key == key)
+     return obj;
+  return NULL;
+
+Quoting Corey Minyard::
+
+  "If the object is moved from one list to another list in-between the
+  time the hash is calculated and the next field is accessed, and the
+  object has moved to the end of a new list, the traversal will not
+  complete properly on the list it should have, since the object will
+  be on the end of the new list and there's not a way to tell it's on a
+  new list and restart the list traversal. I think that this can be
+  solved by pre-fetching the "next" field (with proper barriers) before
+  checking the key."
+
+2) Insert algo
+--------------
+
+We need to make sure a reader cannot read the new 'obj->obj_next' value
+and previous value of 'obj->key'. Or else, an item could be deleted
+from a chain, and inserted into another chain. If new chain was empty
+before the move, 'next' pointer is NULL, and lockless reader can
+not detect it missed following items in original chain.
+
+::
+
+  /*
+  * Please note that new inserts are done at the head of list,
+  * not in the middle or end.
+  */
+  obj = kmem_cache_alloc(...);
+  lock_chain(); // typically a spin_lock()
+  obj->key = key;
+  /*
+  * we need to make sure obj->key is updated before obj->next
+  * or obj->refcnt
+  */
+  smp_wmb();
+  atomic_set(&obj->refcnt, 1);
+  hlist_add_head_rcu(&obj->obj_node, list);
+  unlock_chain(); // typically a spin_unlock()
+
+
+3) Remove algo
+--------------
+Nothing special here, we can use a standard RCU hlist deletion.
+But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
+very very fast (before the end of RCU grace period)
+
+::
+
+  if (put_last_reference_on(obj)) {
+    lock_chain(); // typically a spin_lock()
+    hlist_del_init_rcu(&obj->obj_node);
+    unlock_chain(); // typically a spin_unlock()
+    kmem_cache_free(cachep, obj);
+  }
+
+
+
+--------------------------------------------------------------------------
+
+With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup()
+and extra smp_wmb() in insert function.
+
+For example, if we choose to store the slot number as the 'nulls'
+end-of-list marker for each slot of the hash table, we can detect
+a race (some writer did a delete and/or a move of an object
+to another chain) checking the final 'nulls' value if
+the lookup met the end of chain. If final 'nulls' value
+is not the slot number, then we must restart the lookup at
+the beginning. If the object was moved to the same chain,
+then the reader doesn't care : It might eventually
+scan the list again without harm.
+
+
+1) lookup algo
+--------------
+
+::
+
+  head = &table[slot];
+  rcu_read_lock();
+  begin:
+  hlist_nulls_for_each_entry_rcu(obj, node, head, member) {
+    if (obj->key == key) {
+      if (!try_get_ref(obj)) // might fail for free objects
+        goto begin;
+      if (obj->key != key) { // not the object we expected
+        put_ref(obj);
+        goto begin;
+      }
+    goto out;
+  }
+  /*
+  * if the nulls value we got at the end of this lookup is
+  * not the expected one, we must restart lookup.
+  * We probably met an item that was moved to another chain.
+  */
+  if (get_nulls_value(node) != slot)
+    goto begin;
+  obj = NULL;
+
+  out:
+  rcu_read_unlock();
+
+2) Insert function
+------------------
+
+::
+
+  /*
+  * Please note that new inserts are done at the head of list,
+  * not in the middle or end.
+  */
+  obj = kmem_cache_alloc(cachep);
+  lock_chain(); // typically a spin_lock()
+  obj->key = key;
+  /*
+  * changes to obj->key must be visible before refcnt one
+  */
+  smp_wmb();
+  atomic_set(&obj->refcnt, 1);
+  /*
+  * insert obj in RCU way (readers might be traversing chain)
+  */
+  hlist_nulls_add_head_rcu(&obj->obj_node, list);
+  unlock_chain(); // typically a spin_unlock()
diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt
deleted file mode 100644
index 23f115dc87cf..000000000000
--- a/Documentation/RCU/rculist_nulls.txt
+++ /dev/null
@@ -1,172 +0,0 @@
-Using hlist_nulls to protect read-mostly linked lists and
-objects using SLAB_TYPESAFE_BY_RCU allocations.
-
-Please read the basics in Documentation/RCU/listRCU.rst
-
-Using special makers (called 'nulls') is a convenient way
-to solve following problem :
-
-A typical RCU linked list managing objects which are
-allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can
-use following algos :
-
-1) Lookup algo
---------------
-rcu_read_lock()
-begin:
-obj = lockless_lookup(key);
-if (obj) {
-  if (!try_get_ref(obj)) // might fail for free objects
-    goto begin;
-  /*
-   * Because a writer could delete object, and a writer could
-   * reuse these object before the RCU grace period, we
-   * must check key after getting the reference on object
-   */
-  if (obj->key != key) { // not the object we expected
-     put_ref(obj);
-     goto begin;
-   }
-}
-rcu_read_unlock();
-
-Beware that lockless_lookup(key) cannot use traditional hlist_for_each_entry_rcu()
-but a version with an additional memory barrier (smp_rmb())
-
-lockless_lookup(key)
-{
-   struct hlist_node *node, *next;
-   for (pos = rcu_dereference((head)->first);
-          pos && ({ next = pos->next; smp_rmb(); prefetch(next); 1; }) &&
-          ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
-          pos = rcu_dereference(next))
-      if (obj->key == key)
-         return obj;
-   return NULL;
-
-And note the traditional hlist_for_each_entry_rcu() misses this smp_rmb() :
-
-   struct hlist_node *node;
-   for (pos = rcu_dereference((head)->first);
-		pos && ({ prefetch(pos->next); 1; }) &&
-		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; });
-		pos = rcu_dereference(pos->next))
-      if (obj->key == key)
-         return obj;
-   return NULL;
-}
-
-Quoting Corey Minyard :
-
-"If the object is moved from one list to another list in-between the
- time the hash is calculated and the next field is accessed, and the
- object has moved to the end of a new list, the traversal will not
- complete properly on the list it should have, since the object will
- be on the end of the new list and there's not a way to tell it's on a
- new list and restart the list traversal.  I think that this can be
- solved by pre-fetching the "next" field (with proper barriers) before
- checking the key."
-
-2) Insert algo :
-----------------
-
-We need to make sure a reader cannot read the new 'obj->obj_next' value
-and previous value of 'obj->key'. Or else, an item could be deleted
-from a chain, and inserted into another chain. If new chain was empty
-before the move, 'next' pointer is NULL, and lockless reader can
-not detect it missed following items in original chain.
-
-/*
- * Please note that new inserts are done at the head of list,
- * not in the middle or end.
- */
-obj = kmem_cache_alloc(...);
-lock_chain(); // typically a spin_lock()
-obj->key = key;
-/*
- * we need to make sure obj->key is updated before obj->next
- * or obj->refcnt
- */
-smp_wmb();
-atomic_set(&obj->refcnt, 1);
-hlist_add_head_rcu(&obj->obj_node, list);
-unlock_chain(); // typically a spin_unlock()
-
-
-3) Remove algo
---------------
-Nothing special here, we can use a standard RCU hlist deletion.
-But thanks to SLAB_TYPESAFE_BY_RCU, beware a deleted object can be reused
-very very fast (before the end of RCU grace period)
-
-if (put_last_reference_on(obj) {
-   lock_chain(); // typically a spin_lock()
-   hlist_del_init_rcu(&obj->obj_node);
-   unlock_chain(); // typically a spin_unlock()
-   kmem_cache_free(cachep, obj);
-}
-
-
-
---------------------------------------------------------------------------
-With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup()
-and extra smp_wmb() in insert function.
-
-For example, if we choose to store the slot number as the 'nulls'
-end-of-list marker for each slot of the hash table, we can detect
-a race (some writer did a delete and/or a move of an object
-to another chain) checking the final 'nulls' value if
-the lookup met the end of chain. If final 'nulls' value
-is not the slot number, then we must restart the lookup at
-the beginning. If the object was moved to the same chain,
-then the reader doesn't care : It might eventually
-scan the list again without harm.
-
-
-1) lookup algo
-
- head = &table[slot];
- rcu_read_lock();
-begin:
- hlist_nulls_for_each_entry_rcu(obj, node, head, member) {
-   if (obj->key == key) {
-      if (!try_get_ref(obj)) // might fail for free objects
-         goto begin;
-      if (obj->key != key) { // not the object we expected
-         put_ref(obj);
-         goto begin;
-      }
-  goto out;
- }
-/*
- * if the nulls value we got at the end of this lookup is
- * not the expected one, we must restart lookup.
- * We probably met an item that was moved to another chain.
- */
- if (get_nulls_value(node) != slot)
-   goto begin;
- obj = NULL;
-
-out:
- rcu_read_unlock();
-
-2) Insert function :
---------------------
-
-/*
- * Please note that new inserts are done at the head of list,
- * not in the middle or end.
- */
-obj = kmem_cache_alloc(cachep);
-lock_chain(); // typically a spin_lock()
-obj->key = key;
-/*
- * changes to obj->key must be visible before refcnt one
- */
-smp_wmb();
-atomic_set(&obj->refcnt, 1);
-/*
- * insert obj in RCU way (readers might be traversing chain)
- */
-hlist_nulls_add_head_rcu(&obj->obj_node, list);
-unlock_chain(); // typically a spin_unlock()
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index 9670b54b484a..ff3e94779e73 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -162,7 +162,7 @@ static inline void hlist_nulls_add_fake(struct hlist_nulls_node *n)
  * The barrier() is needed to make sure compiler doesn't cache first element [1],
  * as this loop can be restarted [2]
  * [1] Documentation/core-api/atomic_ops.rst around line 114
- * [2] Documentation/RCU/rculist_nulls.txt around line 146
+ * [2] Documentation/RCU/rculist_nulls.rst around line 146
  */
 #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member)			\
 	for (({barrier();}),							\
diff --git a/net/core/sock.c b/net/core/sock.c
index d832c650287c..6921a85a1177 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1973,7 +1973,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
 
 		/*
 		 * Before updating sk_refcnt, we must commit prior changes to memory
-		 * (Documentation/RCU/rculist_nulls.txt for details)
+		 * (Documentation/RCU/rculist_nulls.rst for details)
 		 */
 		smp_wmb();
 		refcount_set(&newsk->sk_refcnt, 2);
@@ -3035,7 +3035,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk_rx_queue_clear(sk);
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
-	 * (Documentation/RCU/rculist_nulls.txt for details)
+	 * (Documentation/RCU/rculist_nulls.rst for details)
 	 */
 	smp_wmb();
 	refcount_set(&sk->sk_refcnt, 1);

From 43cb5451dffe0bc5d59688d4898c9a1f7c40d3b4 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 21 Apr 2020 19:04:06 +0200
Subject: [PATCH 081/502] docs: RCU: Convert torture.txt to ReST

- Add a SPDX header;
- Adjust document and section titles;
- Some whitespace fixes and new line breaks;
- Mark literal blocks as such;
- Add it to RCU/index.rst.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst                   |   1 +
 .../RCU/{torture.txt => torture.rst}          | 115 ++++++++++--------
 Documentation/locking/locktorture.rst         |   2 +-
 MAINTAINERS                                   |   4 +-
 kernel/rcu/rcutorture.c                       |   2 +-
 5 files changed, 68 insertions(+), 56 deletions(-)
 rename Documentation/RCU/{torture.txt => torture.rst} (76%)

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 577a47e27f5d..5d5f9a1ab8f9 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -18,6 +18,7 @@ RCU concepts
    whatisRCU
    rcu
    rculist_nulls
+   torture
    listRCU
    NMI-RCU
    UP
diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.rst
similarity index 76%
rename from Documentation/RCU/torture.txt
rename to Documentation/RCU/torture.rst
index af712a3c5b6a..a90147713062 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.rst
@@ -1,7 +1,12 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==========================
 RCU Torture Test Operation
+==========================
 
 
 CONFIG_RCU_TORTURE_TEST
+=======================
 
 The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
 implementations.  It creates an rcutorture kernel module that can
@@ -13,9 +18,10 @@ when the module is loaded, and stops when the module is unloaded.
 Module parameters are prefixed by "rcutorture." in
 Documentation/admin-guide/kernel-parameters.txt.
 
-OUTPUT
+Output
+======
 
-The statistics output is as follows:
+The statistics output is as follows::
 
 	rcu-torture:--- Start of test: nreaders=16 nfakewriters=4 stat_interval=30 verbose=0 test_no_idle_hz=1 shuffle_interval=3 stutter=5 irqreader=1 fqs_duration=0 fqs_holdoff=0 fqs_stutter=3 test_boost=1/0 test_boost_interval=7 test_boost_duration=4
 	rcu-torture: rtc:           (null) ver: 155441 tfle: 0 rta: 155441 rtaf: 8884 rtf: 155440 rtmbe: 0 rtbe: 0 rtbke: 0 rtbre: 0 rtbf: 0 rtb: 0 nt: 3055767
@@ -36,53 +42,53 @@ automatic determination as to whether RCU operated correctly.
 
 The entries are as follows:
 
-o	"rtc": The hexadecimal address of the structure currently visible
+*	"rtc": The hexadecimal address of the structure currently visible
 	to readers.
 
-o	"ver": The number of times since boot that the RCU writer task
+*	"ver": The number of times since boot that the RCU writer task
 	has changed the structure visible to readers.
 
-o	"tfle": If non-zero, indicates that the "torture freelist"
+*	"tfle": If non-zero, indicates that the "torture freelist"
 	containing structures to be placed into the "rtc" area is empty.
 	This condition is important, since it can fool you into thinking
 	that RCU is working when it is not.  :-/
 
-o	"rta": Number of structures allocated from the torture freelist.
+*	"rta": Number of structures allocated from the torture freelist.
 
-o	"rtaf": Number of allocations from the torture freelist that have
+*	"rtaf": Number of allocations from the torture freelist that have
 	failed due to the list being empty.  It is not unusual for this
 	to be non-zero, but it is bad for it to be a large fraction of
 	the value indicated by "rta".
 
-o	"rtf": Number of frees into the torture freelist.
+*	"rtf": Number of frees into the torture freelist.
 
-o	"rtmbe": A non-zero value indicates that rcutorture believes that
+*	"rtmbe": A non-zero value indicates that rcutorture believes that
 	rcu_assign_pointer() and rcu_dereference() are not working
 	correctly.  This value should be zero.
 
-o	"rtbe": A non-zero value indicates that one of the rcu_barrier()
+*	"rtbe": A non-zero value indicates that one of the rcu_barrier()
 	family of functions is not working correctly.
 
-o	"rtbke": rcutorture was unable to create the real-time kthreads
+*	"rtbke": rcutorture was unable to create the real-time kthreads
 	used to force RCU priority inversion.  This value should be zero.
 
-o	"rtbre": Although rcutorture successfully created the kthreads
+*	"rtbre": Although rcutorture successfully created the kthreads
 	used to force RCU priority inversion, it was unable to set them
 	to the real-time priority level of 1.  This value should be zero.
 
-o	"rtbf": The number of times that RCU priority boosting failed
+*	"rtbf": The number of times that RCU priority boosting failed
 	to resolve RCU priority inversion.
 
-o	"rtb": The number of times that rcutorture attempted to force
+*	"rtb": The number of times that rcutorture attempted to force
 	an RCU priority inversion condition.  If you are testing RCU
 	priority boosting via the "test_boost" module parameter, this
 	value should be non-zero.
 
-o	"nt": The number of times rcutorture ran RCU read-side code from
+*	"nt": The number of times rcutorture ran RCU read-side code from
 	within a timer handler.  This value should be non-zero only
 	if you specified the "irqreader" module parameter.
 
-o	"Reader Pipe": Histogram of "ages" of structures seen by readers.
+*	"Reader Pipe": Histogram of "ages" of structures seen by readers.
 	If any entries past the first two are non-zero, RCU is broken.
 	And rcutorture prints the error flag string "!!!" to make sure
 	you notice.  The age of a newly allocated structure is zero,
@@ -94,14 +100,14 @@ o	"Reader Pipe": Histogram of "ages" of structures seen by readers.
 	RCU.  If you want to see what it looks like when broken, break
 	it yourself.  ;-)
 
-o	"Reader Batch": Another histogram of "ages" of structures seen
+*	"Reader Batch": Another histogram of "ages" of structures seen
 	by readers, but in terms of counter flips (or batches) rather
 	than in terms of grace periods.  The legal number of non-zero
 	entries is again two.  The reason for this separate view is that
 	it is sometimes easier to get the third entry to show up in the
 	"Reader Batch" list than in the "Reader Pipe" list.
 
-o	"Free-Block Circulation": Shows the number of torture structures
+*	"Free-Block Circulation": Shows the number of torture structures
 	that have reached a given point in the pipeline.  The first element
 	should closely correspond to the number of structures allocated,
 	the second to the number that have been removed from reader view,
@@ -112,7 +118,7 @@ o	"Free-Block Circulation": Shows the number of torture structures
 
 Different implementations of RCU can provide implementation-specific
 additional information.  For example, Tree SRCU provides the following
-additional line:
+additional line::
 
 	srcud-torture: Tree SRCU per-CPU(idx=0): 0(35,-21) 1(-4,24) 2(1,1) 3(-26,20) 4(28,-47) 5(-9,4) 6(-10,14) 7(-14,11) T(1,6)
 
@@ -123,15 +129,15 @@ using a dynamically allocated srcu_struct (hence "srcud-" rather than
 "old" and "current" values to the underlying array, and is useful for
 debugging.  The final "T" entry contains the totals of the counters.
 
-
-USAGE ON SPECIFIC KERNEL BUILDS
+Usage on Specific Kernel Builds
+===============================
 
 It is sometimes desirable to torture RCU on a specific kernel build,
 for example, when preparing to put that kernel build into production.
 In that case, the kernel should be built with CONFIG_RCU_TORTURE_TEST=m
 so that the test can be started using modprobe and terminated using rmmod.
 
-For example, the following script may be used to torture RCU:
+For example, the following script may be used to torture RCU::
 
 	#!/bin/sh
 
@@ -148,7 +154,8 @@ two are self-explanatory, while the last indicates that while there
 were no RCU failures, CPU-hotplug problems were detected.
 
 
-USAGE ON MAINLINE KERNELS
+Usage on Mainline Kernels
+=========================
 
 When using rcutorture to test changes to RCU itself, it is often
 necessary to build a number of kernels in order to test that change
@@ -180,16 +187,16 @@ to Tree SRCU might run only the SRCU-N and SRCU-P scenarios using the
 --configs argument to kvm.sh as follows:  "--configs 'SRCU-N SRCU-P'".
 Large systems can run multiple copies of of the full set of scenarios,
 for example, a system with 448 hardware threads can run five instances
-of the full set concurrently.  To make this happen:
+of the full set concurrently.  To make this happen::
 
 	kvm.sh --cpus 448 --configs '5*CFLIST'
 
 Alternatively, such a system can run 56 concurrent instances of a single
-eight-CPU scenario:
+eight-CPU scenario::
 
 	kvm.sh --cpus 448 --configs '56*TREE04'
 
-Or 28 concurrent instances of each of two eight-CPU scenarios:
+Or 28 concurrent instances of each of two eight-CPU scenarios::
 
 	kvm.sh --cpus 448 --configs '28*TREE03 28*TREE04'
 
@@ -199,14 +206,14 @@ values for memory may require disabling the callback-flooding tests
 using the --bootargs parameter discussed below.
 
 Sometimes additional debugging is useful, and in such cases the --kconfig
-parameter to kvm.sh may be used, for example, "--kconfig 'CONFIG_KASAN=y'".
+parameter to kvm.sh may be used, for example, ``--kconfig 'CONFIG_KASAN=y'``.
 
 Kernel boot arguments can also be supplied, for example, to control
 rcutorture's module parameters.  For example, to test a change to RCU's
 CPU stall-warning code, use "--bootargs 'rcutorture.stall_cpu=30'".
 This will of course result in the scripting reporting a failure, namely
 the resuling RCU CPU stall warning.  As noted above, reducing memory may
-require disabling rcutorture's callback-flooding tests:
+require disabling rcutorture's callback-flooding tests::
 
 	kvm.sh --cpus 448 --configs '56*TREE04' --memory 128M \
 		--bootargs 'rcutorture.fwd_progress=0'
@@ -225,7 +232,7 @@ is listed at the end of the kvm.sh output, which you really should redirect
 to a file.  The build products and console output of each run is kept in
 tools/testing/selftests/rcutorture/res in timestamped directories.  A
 given directory can be supplied to kvm-find-errors.sh in order to have
-it cycle you through summaries of errors and full error logs.  For example:
+it cycle you through summaries of errors and full error logs.  For example::
 
 	tools/testing/selftests/rcutorture/bin/kvm-find-errors.sh \
 		tools/testing/selftests/rcutorture/res/2020.01.20-15.54.23
@@ -245,38 +252,42 @@ that was tested and any uncommitted changes in diff format.
 
 The most frequently used files in each per-scenario-run directory are:
 
-.config: This file contains the Kconfig options.
+.config:
+	This file contains the Kconfig options.
 
-Make.out: This contains build output for a specific scenario.
+Make.out:
+	This contains build output for a specific scenario.
 
-console.log: This contains the console output for a specific scenario.
+console.log:
+	This contains the console output for a specific scenario.
 	This file may be examined once the kernel has booted, but
 	it might not exist if the build failed.
 
-vmlinux: This contains the kernel, which can be useful with tools like
+vmlinux:
+	This contains the kernel, which can be useful with tools like
 	objdump and gdb.
 
 A number of additional files are available, but are less frequently used.
 Many are intended for debugging of rcutorture itself or of its scripting.
 
 As of v5.4, a successful run with the default set of scenarios produces
-the following summary at the end of the run on a 12-CPU system:
+the following summary at the end of the run on a 12-CPU system::
 
-SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ]
-SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ]
-SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ]
-SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ]
-TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ]
-TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ]
-TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ]
-TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198
-TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631
-TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ]
-TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844
-TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497
-CPU count limited from 16 to 12
-TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961
-TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997
-TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732
-CPU count limited from 16 to 12
-TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011
+    SRCU-N ------- 804233 GPs (148.932/s) [srcu: g10008272 f0x0 ]
+    SRCU-P ------- 202320 GPs (37.4667/s) [srcud: g1809476 f0x0 ]
+    SRCU-t ------- 1122086 GPs (207.794/s) [srcu: g0 f0x0 ]
+    SRCU-u ------- 1111285 GPs (205.794/s) [srcud: g1 f0x0 ]
+    TASKS01 ------- 19666 GPs (3.64185/s) [tasks: g0 f0x0 ]
+    TASKS02 ------- 20541 GPs (3.80389/s) [tasks: g0 f0x0 ]
+    TASKS03 ------- 19416 GPs (3.59556/s) [tasks: g0 f0x0 ]
+    TINY01 ------- 836134 GPs (154.84/s) [rcu: g0 f0x0 ] n_max_cbs: 34198
+    TINY02 ------- 850371 GPs (157.476/s) [rcu: g0 f0x0 ] n_max_cbs: 2631
+    TREE01 ------- 162625 GPs (30.1157/s) [rcu: g1124169 f0x0 ]
+    TREE02 ------- 333003 GPs (61.6672/s) [rcu: g2647753 f0x0 ] n_max_cbs: 35844
+    TREE03 ------- 306623 GPs (56.782/s) [rcu: g2975325 f0x0 ] n_max_cbs: 1496497
+    CPU count limited from 16 to 12
+    TREE04 ------- 246149 GPs (45.5831/s) [rcu: g1695737 f0x0 ] n_max_cbs: 434961
+    TREE05 ------- 314603 GPs (58.2598/s) [rcu: g2257741 f0x2 ] n_max_cbs: 193997
+    TREE07 ------- 167347 GPs (30.9902/s) [rcu: g1079021 f0x0 ] n_max_cbs: 478732
+    CPU count limited from 16 to 12
+    TREE09 ------- 752238 GPs (139.303/s) [rcu: g13075057 f0x0 ] n_max_cbs: 99011
diff --git a/Documentation/locking/locktorture.rst b/Documentation/locking/locktorture.rst
index 8012a74555e7..dfaf9fc883f4 100644
--- a/Documentation/locking/locktorture.rst
+++ b/Documentation/locking/locktorture.rst
@@ -166,4 +166,4 @@ checked for such errors.  The "rmmod" command forces a "SUCCESS",
 two are self-explanatory, while the last indicates that while there
 were no locking failures, CPU-hotplug problems were detected.
 
-Also see: Documentation/RCU/torture.txt
+Also see: Documentation/RCU/torture.rst
diff --git a/MAINTAINERS b/MAINTAINERS
index 496fd4eafb68..4429ce965b3a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14437,7 +14437,7 @@ T:	git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev
 F:	Documentation/RCU/
 F:	include/linux/rcu*
 F:	kernel/rcu/
-X:	Documentation/RCU/torture.txt
+X:	Documentation/RCU/torture.rst
 X:	include/linux/srcu*.h
 X:	kernel/rcu/srcu*.c
 
@@ -17288,7 +17288,7 @@ M:	Josh Triplett <josh@joshtriplett.org>
 L:	linux-kernel@vger.kernel.org
 S:	Supported
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev
-F:	Documentation/RCU/torture.txt
+F:	Documentation/RCU/torture.rst
 F:	kernel/locking/locktorture.c
 F:	kernel/rcu/rcuperf.c
 F:	kernel/rcu/rcutorture.c
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efb792e13fca..8205295fc33e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -7,7 +7,7 @@
  * Authors: Paul E. McKenney <paulmck@linux.ibm.com>
  *	  Josh Triplett <josh@joshtriplett.org>
  *
- * See also:  Documentation/RCU/torture.txt
+ * See also:  Documentation/RCU/torture.rst
  */
 
 #define pr_fmt(fmt) fmt

From 90c73cb2c65f9e78eb09a8cbcd4bcd4add2d3f4d Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 21 Apr 2020 19:04:07 +0200
Subject: [PATCH 082/502] docs: RCU: Convert rcuref.txt to ReST

- Add a SPDX header;
- Adjust document title;
- Some whitespace fixes and new line breaks;
- Mark literal blocks as such;
- Add it to RCU/index.rst.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst                  |   1 +
 Documentation/RCU/{rcuref.txt => rcuref.rst} | 193 ++++++++++---------
 2 files changed, 101 insertions(+), 93 deletions(-)
 rename Documentation/RCU/{rcuref.txt => rcuref.rst} (50%)

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 5d5f9a1ab8f9..9a1d51f394dc 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -18,6 +18,7 @@ RCU concepts
    whatisRCU
    rcu
    rculist_nulls
+   rcuref
    torture
    listRCU
    NMI-RCU
diff --git a/Documentation/RCU/rcuref.txt b/Documentation/RCU/rcuref.rst
similarity index 50%
rename from Documentation/RCU/rcuref.txt
rename to Documentation/RCU/rcuref.rst
index 5e6429d66c24..b33aeb14fde3 100644
--- a/Documentation/RCU/rcuref.txt
+++ b/Documentation/RCU/rcuref.rst
@@ -1,4 +1,8 @@
-Reference-count design for elements of lists/arrays protected by RCU.
+.. SPDX-License-Identifier: GPL-2.0
+
+====================================================================
+Reference-count design for elements of lists/arrays protected by RCU
+====================================================================
 
 
 Please note that the percpu-ref feature is likely your first
@@ -12,32 +16,33 @@ please read on.
 Reference counting on elements of lists which are protected by traditional
 reader/writer spinlocks or semaphores are straightforward:
 
-CODE LISTING A:
-1.				2.
-add()				search_and_reference()
-{				{
-    alloc_object		    read_lock(&list_lock);
-    ...				    search_for_element
-    atomic_set(&el->rc, 1);	    atomic_inc(&el->rc);
-    write_lock(&list_lock);	     ...
-    add_element			    read_unlock(&list_lock);
-    ...				    ...
-    write_unlock(&list_lock);	}
-}
+CODE LISTING A::
 
-3.					4.
-release_referenced()			delete()
-{					{
-    ...					    write_lock(&list_lock);
-    if(atomic_dec_and_test(&el->rc))	    ...
-	kfree(el);
-    ...					    remove_element
-}					    write_unlock(&list_lock);
- 					    ...
-					    if (atomic_dec_and_test(&el->rc))
-					        kfree(el);
-					    ...
-					}
+    1.					    2.
+    add()				    search_and_reference()
+    {					    {
+	alloc_object				read_lock(&list_lock);
+	...					search_for_element
+	atomic_set(&el->rc, 1);			atomic_inc(&el->rc);
+	write_lock(&list_lock);			...
+	add_element				read_unlock(&list_lock);
+	...					...
+	write_unlock(&list_lock);	    }
+    }
+
+    3.					    4.
+    release_referenced()		    delete()
+    {					    {
+	...					write_lock(&list_lock);
+	if(atomic_dec_and_test(&el->rc))	...
+	    kfree(el);
+	...					remove_element
+    }						write_unlock(&list_lock);
+						...
+						if (atomic_dec_and_test(&el->rc))
+						    kfree(el);
+						...
+					    }
 
 If this list/array is made lock free using RCU as in changing the
 write_lock() in add() and delete() to spin_lock() and changing read_lock()
@@ -46,34 +51,35 @@ search_and_reference() could potentially hold reference to an element which
 has already been deleted from the list/array.  Use atomic_inc_not_zero()
 in this scenario as follows:
 
-CODE LISTING B:
-1.					2.
-add()					search_and_reference()
-{					{
-    alloc_object			    rcu_read_lock();
-    ...					    search_for_element
-    atomic_set(&el->rc, 1);		    if (!atomic_inc_not_zero(&el->rc)) {
-    spin_lock(&list_lock);		        rcu_read_unlock();
-					        return FAIL;
-    add_element				    }
-    ...					    ...
-    spin_unlock(&list_lock);		    rcu_read_unlock();
-}					}
-3.					4.
-release_referenced()			delete()
-{					{
-    ...					    spin_lock(&list_lock);
-    if (atomic_dec_and_test(&el->rc))       ...
-        call_rcu(&el->head, el_free);       remove_element
-    ...                                     spin_unlock(&list_lock);
-} 					    ...
-					    if (atomic_dec_and_test(&el->rc))
-					        call_rcu(&el->head, el_free);
-					    ...
-					}
+CODE LISTING B::
+
+    1.					    2.
+    add()				    search_and_reference()
+    {					    {
+	alloc_object				rcu_read_lock();
+	...					search_for_element
+	atomic_set(&el->rc, 1);			if (!atomic_inc_not_zero(&el->rc)) {
+	spin_lock(&list_lock);			    rcu_read_unlock();
+						    return FAIL;
+	add_element				}
+	...					...
+	spin_unlock(&list_lock);		rcu_read_unlock();
+    }					    }
+    3.					    4.
+    release_referenced()		    delete()
+    {					    {
+	...					spin_lock(&list_lock);
+	if (atomic_dec_and_test(&el->rc))	...
+	    call_rcu(&el->head, el_free);	remove_element
+	...					spin_unlock(&list_lock);
+    }						...
+						if (atomic_dec_and_test(&el->rc))
+						    call_rcu(&el->head, el_free);
+						...
+					    }
 
 Sometimes, a reference to the element needs to be obtained in the
-update (write) stream.  In such cases, atomic_inc_not_zero() might be
+update (write) stream.	In such cases, atomic_inc_not_zero() might be
 overkill, since we hold the update-side spinlock.  One might instead
 use atomic_inc() in such cases.
 
@@ -82,39 +88,40 @@ search_and_reference() code path.  In such cases, the
 atomic_dec_and_test() may be moved from delete() to el_free()
 as follows:
 
-CODE LISTING C:
-1.					2.
-add()					search_and_reference()
-{					{
-    alloc_object			    rcu_read_lock();
-    ...					    search_for_element
-    atomic_set(&el->rc, 1);		    atomic_inc(&el->rc);
-    spin_lock(&list_lock);		    ...
+CODE LISTING C::
 
-    add_element				    rcu_read_unlock();
-    ...					}
-    spin_unlock(&list_lock);		4.
-}					delete()
-3.					{
-release_referenced()			    spin_lock(&list_lock);
-{					    ...
-    ...					    remove_element
-    if (atomic_dec_and_test(&el->rc))       spin_unlock(&list_lock);
-        kfree(el);			    ...
-    ...                                     call_rcu(&el->head, el_free);
-} 					    ...
-5.					}
-void el_free(struct rcu_head *rhp)
-{
-    release_referenced();
-}
+    1.					    2.
+    add()				    search_and_reference()
+    {					    {
+	alloc_object				rcu_read_lock();
+	...					search_for_element
+	atomic_set(&el->rc, 1);			atomic_inc(&el->rc);
+	spin_lock(&list_lock);			...
+
+	add_element				rcu_read_unlock();
+	...				    }
+	spin_unlock(&list_lock);	    4.
+    }					    delete()
+    3.					    {
+    release_referenced()			spin_lock(&list_lock);
+    {						...
+	...					remove_element
+	if (atomic_dec_and_test(&el->rc))	spin_unlock(&list_lock);
+	    kfree(el);				...
+	...					call_rcu(&el->head, el_free);
+    }						...
+    5.					    }
+    void el_free(struct rcu_head *rhp)
+    {
+	release_referenced();
+    }
 
 The key point is that the initial reference added by add() is not removed
 until after a grace period has elapsed following removal.  This means that
 search_and_reference() cannot find this element, which means that the value
 of el->rc cannot increase.  Thus, once it reaches zero, there are no
-readers that can or ever will be able to reference the element.  The
-element can therefore safely be freed.  This in turn guarantees that if
+readers that can or ever will be able to reference the element.	 The
+element can therefore safely be freed.	This in turn guarantees that if
 any reader finds the element, that reader may safely acquire a reference
 without checking the value of the reference counter.
 
@@ -130,21 +137,21 @@ the eventual invocation of kfree(), which is usually not a problem on
 modern computer systems, even the small ones.
 
 In cases where delete() can sleep, synchronize_rcu() can be called from
-delete(), so that el_free() can be subsumed into delete as follows:
+delete(), so that el_free() can be subsumed into delete as follows::
 
-4.
-delete()
-{
-    spin_lock(&list_lock);
-    ...
-    remove_element
-    spin_unlock(&list_lock);
-    ...
-    synchronize_rcu();
-    if (atomic_dec_and_test(&el->rc))
-    	kfree(el);
-    ...
-}
+    4.
+    delete()
+    {
+	spin_lock(&list_lock);
+	...
+	remove_element
+	spin_unlock(&list_lock);
+	...
+	synchronize_rcu();
+	if (atomic_dec_and_test(&el->rc))
+	    kfree(el);
+	...
+    }
 
 As additional examples in the kernel, the pattern in listing C is used by
 reference counting of struct pid, while the pattern in listing B is used by

From f2286ab99549271f3cec73e305b9ecca95d91394 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 21 Apr 2020 19:04:10 +0200
Subject: [PATCH 083/502] docs: RCU: Convert stallwarn.txt to ReST

- Add a SPDX header;
- Adjust document and section titles;
- Fix list markups;
- Some whitespace fixes and new line breaks;
- Mark literal blocks as such;
- Add it to RCU/index.rst.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/index.rst                   |  1 +
 .../RCU/{stallwarn.txt => stallwarn.rst}      | 55 ++++++++++++-------
 kernel/rcu/tree_stall.h                       |  4 +-
 3 files changed, 37 insertions(+), 23 deletions(-)
 rename Documentation/RCU/{stallwarn.txt => stallwarn.rst} (90%)

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 9a1d51f394dc..e703d3dbe60c 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -20,6 +20,7 @@ RCU concepts
    rculist_nulls
    rcuref
    torture
+   stallwarn
    listRCU
    NMI-RCU
    UP
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.rst
similarity index 90%
rename from Documentation/RCU/stallwarn.txt
rename to Documentation/RCU/stallwarn.rst
index a360a8796710..08bc9aec4606 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.rst
@@ -1,4 +1,8 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
 Using RCU's CPU Stall Detector
+==============================
 
 This document first discusses what sorts of issues RCU's CPU stall
 detector can locate, and then discusses kernel parameters and Kconfig
@@ -7,39 +11,40 @@ this document explains the stall detector's "splat" format.
 
 
 What Causes RCU CPU Stall Warnings?
+===================================
 
 So your kernel printed an RCU CPU stall warning.  The next question is
 "What caused it?"  The following problems can result in RCU CPU stall
 warnings:
 
-o	A CPU looping in an RCU read-side critical section.
+-	A CPU looping in an RCU read-side critical section.
 
-o	A CPU looping with interrupts disabled.
+-	A CPU looping with interrupts disabled.
 
-o	A CPU looping with preemption disabled.
+-	A CPU looping with preemption disabled.
 
-o	A CPU looping with bottom halves disabled.
+-	A CPU looping with bottom halves disabled.
 
-o	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
+-	For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
 	without invoking schedule().  If the looping in the kernel is
 	really expected and desirable behavior, you might need to add
 	some calls to cond_resched().
 
-o	Booting Linux using a console connection that is too slow to
+-	Booting Linux using a console connection that is too slow to
 	keep up with the boot-time console-message rate.  For example,
 	a 115Kbaud serial console can be -way- too slow to keep up
 	with boot-time message rates, and will frequently result in
 	RCU CPU stall warning messages.  Especially if you have added
 	debug printk()s.
 
-o	Anything that prevents RCU's grace-period kthreads from running.
+-	Anything that prevents RCU's grace-period kthreads from running.
 	This can result in the "All QSes seen" console-log message.
 	This message will include information on when the kthread last
 	ran and how often it should be expected to run.  It can also
-	result in the "rcu_.*kthread starved for" console-log message,
+	result in the ``rcu_.*kthread starved for`` console-log message,
 	which will include additional debugging information.
 
-o	A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
+-	A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
 	happen to preempt a low-priority task in the middle of an RCU
 	read-side critical section.   This is especially damaging if
 	that low-priority task is not permitted to run on any other CPU,
@@ -48,7 +53,7 @@ o	A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
 	While the system is in the process of running itself out of
 	memory, you might see stall-warning messages.
 
-o	A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
+-	A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
 	is running at a higher priority than the RCU softirq threads.
 	This will prevent RCU callbacks from ever being invoked,
 	and in a CONFIG_PREEMPT_RCU kernel will further prevent
@@ -63,7 +68,7 @@ o	A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
 	can increase your system's context-switch rate and thus degrade
 	performance.
 
-o	A periodic interrupt whose handler takes longer than the time
+-	A periodic interrupt whose handler takes longer than the time
 	interval between successive pairs of interrupts.  This can
 	prevent RCU's kthreads and softirq handlers from running.
 	Note that certain high-overhead debugging options, for example
@@ -71,20 +76,20 @@ o	A periodic interrupt whose handler takes longer than the time
 	considerably longer than normal, which can in turn result in
 	RCU CPU stall warnings.
 
-o	Testing a workload on a fast system, tuning the stall-warning
+-	Testing a workload on a fast system, tuning the stall-warning
 	timeout down to just barely avoid RCU CPU stall warnings, and then
 	running the same workload with the same stall-warning timeout on a
 	slow system.  Note that thermal throttling and on-demand governors
 	can cause a single system to be sometimes fast and sometimes slow!
 
-o	A hardware or software issue shuts off the scheduler-clock
+-	A hardware or software issue shuts off the scheduler-clock
 	interrupt on a CPU that is not in dyntick-idle mode.  This
 	problem really has happened, and seems to be most likely to
 	result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
 
-o	A bug in the RCU implementation.
+-	A bug in the RCU implementation.
 
-o	A hardware failure.  This is quite unlikely, but has occurred
+-	A hardware failure.  This is quite unlikely, but has occurred
 	at least once in real life.  A CPU failed in a running system,
 	becoming unresponsive, but not causing an immediate crash.
 	This resulted in a series of RCU CPU stall warnings, eventually
@@ -109,6 +114,7 @@ see include/trace/events/rcu.h.
 
 
 Fine-Tuning the RCU CPU Stall Detector
+======================================
 
 The rcuupdate.rcu_cpu_stall_suppress module parameter disables RCU's
 CPU stall detector, which detects conditions that unduly delay RCU grace
@@ -118,6 +124,7 @@ The stall detector's idea of what constitutes "unduly delayed" is
 controlled by a set of kernel configuration variables and cpp macros:
 
 CONFIG_RCU_CPU_STALL_TIMEOUT
+----------------------------
 
 	This kernel configuration parameter defines the period of time
 	that RCU will wait from the beginning of a grace period until it
@@ -137,6 +144,7 @@ CONFIG_RCU_CPU_STALL_TIMEOUT
 	/sys/module/rcupdate/parameters/rcu_cpu_stall_suppress.
 
 RCU_STALL_DELAY_DELTA
+---------------------
 
 	Although the lockdep facility is extremely useful, it does add
 	some overhead.  Therefore, under CONFIG_PROVE_RCU, the
@@ -145,6 +153,7 @@ RCU_STALL_DELAY_DELTA
 	macro, not a kernel configuration parameter.)
 
 RCU_STALL_RAT_DELAY
+-------------------
 
 	The CPU stall detector tries to make the offending CPU print its
 	own warnings, as this often gives better-quality stack traces.
@@ -155,6 +164,7 @@ RCU_STALL_RAT_DELAY
 	parameter.)
 
 rcupdate.rcu_task_stall_timeout
+-------------------------------
 
 	This boot/sysfs parameter controls the RCU-tasks stall warning
 	interval.  A value of zero or less suppresses RCU-tasks stall
@@ -168,9 +178,10 @@ rcupdate.rcu_task_stall_timeout
 
 
 Interpreting RCU's CPU Stall-Detector "Splats"
+==============================================
 
 For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling,
-it will print a message similar to the following:
+it will print a message similar to the following::
 
 	INFO: rcu_sched detected stalls on CPUs/tasks:
 	2-...: (3 GPs behind) idle=06c/0/0 softirq=1453/1455 fqs=0
@@ -223,7 +234,7 @@ an estimate of the total number of RCU callbacks queued across all CPUs
 (625 in this case).
 
 In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
-for each CPU:
+for each CPU::
 
 	0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 dyntick_enabled: 1
 
@@ -235,7 +246,7 @@ processing is enabled.
 
 If the grace period ends just as the stall warning starts printing,
 there will be a spurious stall-warning message, which will include
-the following:
+the following::
 
 	INFO: Stall ended before state dump start
 
@@ -248,7 +259,7 @@ which is overkill for this sort of problem.
 
 If all CPUs and tasks have passed through quiescent states, but the
 grace period has nevertheless failed to end, the stall-warning splat
-will include something like the following:
+will include something like the following::
 
 	All QSes seen, last rcu_preempt kthread activity 23807 (4297905177-4297881370), jiffies_till_next_fqs=3, root ->qsmask 0x0
 
@@ -261,7 +272,7 @@ which is way less than 23807.  Finally, the root rcu_node structure's
 
 If the relevant grace-period kthread has been unable to run prior to
 the stall warning, as was the case in the "All QSes seen" line above,
-the following additional line is printed:
+the following additional line is printed::
 
 	kthread starved for 23807 jiffies! g7075 f0x0 RCU_GP_WAIT_FQS(3) ->state=0x1 ->cpu=5
 
@@ -276,6 +287,7 @@ kthread last ran on CPU 5.
 
 
 Multiple Warnings From One Stall
+================================
 
 If a stall lasts long enough, multiple stall-warning messages will be
 printed for it.  The second and subsequent messages are printed at
@@ -285,9 +297,10 @@ of the stall and the first message.
 
 
 Stall Warnings for Expedited Grace Periods
+==========================================
 
 If an expedited grace period detects a stall, it will place a message
-like the following in dmesg:
+like the following in dmesg::
 
 	INFO: rcu_sched detected expedited stalls on CPUs/tasks: { 7-... } 21119 jiffies s: 73 root: 0x2/.
 
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 54a6dba0280d..b04256cd7e12 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -468,7 +468,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
 
 	/*
 	 * OK, time to rat on our buddy...
-	 * See Documentation/RCU/stallwarn.txt for info on how to debug
+	 * See Documentation/RCU/stallwarn.rst for info on how to debug
 	 * RCU CPU stall warnings.
 	 */
 	pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name);
@@ -535,7 +535,7 @@ static void print_cpu_stall(unsigned long gps)
 
 	/*
 	 * OK, time to rat on ourselves...
-	 * See Documentation/RCU/stallwarn.txt for info on how to debug
+	 * See Documentation/RCU/stallwarn.rst for info on how to debug
 	 * RCU CPU stall warnings.
 	 */
 	pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);

From 2d9c318bfd15394da014737bee30e7b2e22c5eac Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Tue, 21 Apr 2020 19:04:11 +0200
Subject: [PATCH 084/502] docs: RCU: Don't duplicate chapter names in
 rculist_nulls.rst

Since changeset 58ad30cf91f0 ("docs: fix reference to core-api/namespaces.rst"),
auto-references for chapters are generated. This is a nice feature, but
has a drawback: no two chapters can have the same name.

So, we need to add two higher hierarchy chapters on this document,
in order to avoid such duplication.

Fixes: 58ad30cf91f0 ("docs: fix reference to core-api/namespaces.rst")
Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/rculist_nulls.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/Documentation/RCU/rculist_nulls.rst b/Documentation/RCU/rculist_nulls.rst
index d40374221d69..a9fc774bc400 100644
--- a/Documentation/RCU/rculist_nulls.rst
+++ b/Documentation/RCU/rculist_nulls.rst
@@ -10,6 +10,9 @@ objects using SLAB_TYPESAFE_BY_RCU allocations.
 
 Please read the basics in Documentation/RCU/listRCU.rst
 
+Using 'nulls'
+=============
+
 Using special makers (called 'nulls') is a convenient way
 to solve following problem :
 
@@ -126,6 +129,9 @@ very very fast (before the end of RCU grace period)
 
 --------------------------------------------------------------------------
 
+Avoiding extra smp_rmb()
+========================
+
 With hlist_nulls we can avoid extra smp_rmb() in lockless_lookup()
 and extra smp_wmb() in insert function.
 

From b81898e3d2133715e4475d25757595a3e18502ed Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 30 Apr 2020 12:23:11 -0700
Subject: [PATCH 085/502] doc: Timer problems can cause RCU CPU stall warnings

Over the past few years, there have been several cases where timekeeping
bugs have caused RCU CPU stall warnings, particularly during hardware
bringup.  This commit therefore adds such bugs to the list of things
that can result in RCU CPU stall warnings.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/stallwarn.rst | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
index 08bc9aec4606..c9ab6af4d3be 100644
--- a/Documentation/RCU/stallwarn.rst
+++ b/Documentation/RCU/stallwarn.rst
@@ -87,6 +87,13 @@ warnings:
 	problem really has happened, and seems to be most likely to
 	result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels.
 
+-	A hardware or software issue that prevents time-based wakeups
+	from occurring.  These issues can range from misconfigured or
+	buggy timer hardware through bugs in the interrupt or exception
+	path (whether hardware, firmware, or software) through bugs
+	in Linux's timer subsystem through bugs in the scheduler, and,
+	yes, even including bugs in RCU itself.
+
 -	A bug in the RCU implementation.
 
 -	A hardware failure.  This is quite unlikely, but has occurred

From d93d97cbe0d4369153fb04954f1481a9f42aa5b6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 11 May 2020 19:52:34 -0700
Subject: [PATCH 086/502] doc: Tasks RCU must protect instructions before
 trampoline

Protecting the code in a trampoline can also require protecting a
number of instructions prior to actually entering the trampoline.
For example, these earlier instructions might be computing the address
of the trampoline.  This commit therefore updates RCU's requirements to
record this for posterity.

Link: https://lore.kernel.org/lkml/20200511154824.09a18c46@gandalf.local.home/
Reported-by: Lai Jiangshan <jiangshanlai@gmail.com>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/RCU/Design/Requirements/Requirements.rst | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index 75b8ca007a11..a69b5c43a10c 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -2583,7 +2583,12 @@ not work to have these markers in the trampoline itself, because there
 would need to be instructions following ``rcu_read_unlock()``. Although
 ``synchronize_rcu()`` would guarantee that execution reached the
 ``rcu_read_unlock()``, it would not be able to guarantee that execution
-had completely left the trampoline.
+had completely left the trampoline. Worse yet, in some situations
+the trampoline's protection must extend a few instructions *prior* to
+execution reaching the trampoline.  For example, these few instructions
+might calculate the address of the trampoline, so that entering the
+trampoline would be pre-ordained a surprisingly long time before execution
+actually reached the trampoline itself.
 
 The solution, in the form of `Tasks
 RCU <https://lwn.net/Articles/607117/>`__, is to have implicit read-side

From 7ee880b7bf1dea88d0a472b775aebdb4fb6bf860 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Wed, 15 Apr 2020 22:26:55 +0000
Subject: [PATCH 087/502] rcu: Initialize and destroy rcu_synchronize only when
 necessary

The __wait_rcu_gp() function unconditionally initializes and cleans up
each element of rs_array[], whether used or not.  This is slightly
wasteful and rather confusing, so this commit skips both initialization
and cleanup for duplicate callback functions.

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/update.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 84843adfd939..f5a82e107bcb 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -390,13 +390,14 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 			might_sleep();
 			continue;
 		}
-		init_rcu_head_on_stack(&rs_array[i].head);
-		init_completion(&rs_array[i].completion);
 		for (j = 0; j < i; j++)
 			if (crcu_array[j] == crcu_array[i])
 				break;
-		if (j == i)
+		if (j == i) {
+			init_rcu_head_on_stack(&rs_array[i].head);
+			init_completion(&rs_array[i].completion);
 			(crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
+		}
 	}
 
 	/* Wait for all callbacks to be invoked. */
@@ -407,9 +408,10 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 		for (j = 0; j < i; j++)
 			if (crcu_array[j] == crcu_array[i])
 				break;
-		if (j == i)
+		if (j == i) {
 			wait_for_completion(&rs_array[i].completion);
-		destroy_rcu_head_on_stack(&rs_array[i].head);
+			destroy_rcu_head_on_stack(&rs_array[i].head);
+		}
 	}
 }
 EXPORT_SYMBOL_GPL(__wait_rcu_gp);

From 0a3b3c253a1eb2c7fe7f34086d46660c909abeb3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 16 Apr 2020 16:46:10 -0700
Subject: [PATCH 088/502] mm/mmap.c: Add cond_resched() for exit_mmap() CPU
 stalls

A large process running on a heavily loaded system can encounter the
following RCU CPU stall warning:

  rcu: INFO: rcu_sched self-detected stall on CPU
  rcu: 	3-....: (20998 ticks this GP) idle=4ea/1/0x4000000000000002 softirq=556558/556558 fqs=5190
  	(t=21013 jiffies g=1005461 q=132576)
  NMI backtrace for cpu 3
  CPU: 3 PID: 501900 Comm: aio-free-ring-w Kdump: loaded Not tainted 5.2.9-108_fbk12_rc3_3858_gb83b75af7909 #1
  Hardware name: Wiwynn   HoneyBadger/PantherPlus, BIOS HBM6.71 02/03/2016
  Call Trace:
   <IRQ>
   dump_stack+0x46/0x60
   nmi_cpu_backtrace.cold.3+0x13/0x50
   ? lapic_can_unplug_cpu.cold.27+0x34/0x34
   nmi_trigger_cpumask_backtrace+0xba/0xca
   rcu_dump_cpu_stacks+0x99/0xc7
   rcu_sched_clock_irq.cold.87+0x1aa/0x397
   ? tick_sched_do_timer+0x60/0x60
   update_process_times+0x28/0x60
   tick_sched_timer+0x37/0x70
   __hrtimer_run_queues+0xfe/0x270
   hrtimer_interrupt+0xf4/0x210
   smp_apic_timer_interrupt+0x5e/0x120
   apic_timer_interrupt+0xf/0x20
   </IRQ>
  RIP: 0010:kmem_cache_free+0x223/0x300
  Code: 88 00 00 00 0f 85 ca 00 00 00 41 8b 55 18 31 f6 f7 da 41 f6 45 0a 02 40 0f 94 c6 83 c6 05 9c 41 5e fa e8 a0 a7 01 00 41 56 9d <49> 8b 47 08 a8 03 0f 85 87 00 00 00 65 48 ff 08 e9 3d fe ff ff 65
  RSP: 0018:ffffc9000e8e3da8 EFLAGS: 00000206 ORIG_RAX: ffffffffffffff13
  RAX: 0000000000020000 RBX: ffff88861b9de960 RCX: 0000000000000030
  RDX: fffffffffffe41e8 RSI: 000060777fe3a100 RDI: 000000000001be18
  RBP: ffffea00186e7780 R08: ffffffffffffffff R09: ffffffffffffffff
  R10: ffff88861b9dea28 R11: ffff88887ffde000 R12: ffffffff81230a1f
  R13: ffff888854684dc0 R14: 0000000000000206 R15: ffff8888547dbc00
   ? remove_vma+0x4f/0x60
   remove_vma+0x4f/0x60
   exit_mmap+0xd6/0x160
   mmput+0x4a/0x110
   do_exit+0x278/0xae0
   ? syscall_trace_enter+0x1d3/0x2b0
   ? handle_mm_fault+0xaa/0x1c0
   do_group_exit+0x3a/0xa0
   __x64_sys_exit_group+0x14/0x20
   do_syscall_64+0x42/0x100
   entry_SYSCALL_64_after_hwframe+0x44/0xa9

And on a PREEMPT=n kernel, the "while (vma)" loop in exit_mmap() can run
for a very long time given a large process.  This commit therefore adds
a cond_resched() to this loop, providing RCU any needed quiescent states.

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <linux-mm@kvack.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 mm/mmap.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/mmap.c b/mm/mmap.c
index 59a4682ebf3f..972f839c6ec8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3159,6 +3159,7 @@ void exit_mmap(struct mm_struct *mm)
 		if (vma->vm_flags & VM_ACCOUNT)
 			nr_accounted += vma_pages(vma);
 		vma = remove_vma(vma);
+		cond_resched();
 	}
 	vm_unacct_memory(nr_accounted);
 }

From abfce0414814149f716e1d30da1fb3140d1b3473 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@gmail.com>
Date: Sun, 19 Apr 2020 21:57:15 +0000
Subject: [PATCH 089/502] rcu: Simplify the calculation of rcu_state.ncpus

There is only 1 bit set in mask, which means that the only difference
between oldmask and the new one will be at the position where the bit is
set in mask.  This commit therefore updates rcu_state.ncpus by checking
whether the bit in mask is already set in rnp->expmaskinitnext.

Signed-off-by: Wei Yang <richard.weiyang@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6c6569e0586c..bef1dc91bfbe 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3842,10 +3842,9 @@ void rcu_cpu_starting(unsigned int cpu)
 {
 	unsigned long flags;
 	unsigned long mask;
-	int nbits;
-	unsigned long oldmask;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp;
+	bool newcpu;
 
 	if (per_cpu(rcu_cpu_started, cpu))
 		return;
@@ -3857,12 +3856,10 @@ void rcu_cpu_starting(unsigned int cpu)
 	mask = rdp->grpmask;
 	raw_spin_lock_irqsave_rcu_node(rnp, flags);
 	WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
-	oldmask = rnp->expmaskinitnext;
+	newcpu = !(rnp->expmaskinitnext & mask);
 	rnp->expmaskinitnext |= mask;
-	oldmask ^= rnp->expmaskinitnext;
-	nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
 	/* Allow lockless access for expedited grace periods. */
-	smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + nbits); /* ^^^ */
+	smp_store_release(&rcu_state.ncpus, rcu_state.ncpus + newcpu); /* ^^^ */
 	ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
 	rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
 	rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);

From e816d56fad57ba9817cef6606b12f5e14647c3bf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 1 May 2020 16:49:48 -0700
Subject: [PATCH 090/502] rcu: Add callbacks-invoked counters

This commit adds a count of the callbacks invoked to the per-CPU rcu_data
structure.  This count is printed by the show_rcu_gp_kthreads() that
is invoked by rcutorture and the RCU CPU stall-warning code.  It is also
intended for use by drgn.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c       | 1 +
 kernel/rcu/tree.h       | 1 +
 kernel/rcu/tree_stall.h | 3 +++
 3 files changed, 5 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bef1dc91bfbe..874c831bcc45 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2443,6 +2443,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	local_irq_save(flags);
 	rcu_nocb_lock(rdp);
 	count = -rcl.len;
+	rdp->n_cbs_invoked += count;
 	trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(),
 			    is_idle_task(current), rcu_is_callbacks_kthread());
 
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 43991a40b084..9c6f7343bec0 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -171,6 +171,7 @@ struct rcu_data {
 					/* different grace periods. */
 	long		qlen_last_fqs_check;
 					/* qlen at last check for QS forcing */
+	unsigned long	n_cbs_invoked;	/* # callbacks invoked since boot. */
 	unsigned long	n_force_qs_snap;
 					/* did other CPU force QS recently? */
 	long		blimit;		/* Upper limit on a processed batch */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 54a6dba0280d..2768ce6bf657 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -649,6 +649,7 @@ static void check_cpu_stall(struct rcu_data *rdp)
  */
 void show_rcu_gp_kthreads(void)
 {
+	unsigned long cbs = 0;
 	int cpu;
 	unsigned long j;
 	unsigned long ja;
@@ -690,9 +691,11 @@ void show_rcu_gp_kthreads(void)
 	}
 	for_each_possible_cpu(cpu) {
 		rdp = per_cpu_ptr(&rcu_data, cpu);
+		cbs += data_race(rdp->n_cbs_invoked);
 		if (rcu_segcblist_is_offloaded(&rdp->cblist))
 			show_rcu_nocb_state(rdp);
 	}
+	pr_info("RCU callbacks invoked since boot: %lu\n", cbs);
 	show_rcu_tasks_gp_kthreads();
 }
 EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads);

From f8466f94685b5bd931384526cf51e090fd2ac706 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 3 May 2020 19:16:09 -0700
Subject: [PATCH 091/502] rcu: Add comment documenting rcu_callback_map's
 purpose

The rcu_callback_map lockdep_map structure was added back in 2013, but
its purpose has become obscure.  This commit therefore documents that the
purpose of rcu_callback_map is, in the words of commit 24ef659a857 ("rcu:
Provide better diagnostics for blocking in RCU callback functions"),
to help lockdep to tie an "inappropriate voluntary context switch back
to the fact that the function is being invoked from within a callback."

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/update.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f5a82e107bcb..ca17b771ad60 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -279,6 +279,7 @@ struct lockdep_map rcu_sched_lock_map = {
 };
 EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
 
+// Tell lockdep when RCU callbacks are being invoked.
 static struct lock_class_key rcu_callback_key;
 struct lockdep_map rcu_callback_map =
 	STATIC_LOCKDEP_MAP_INIT("rcu_callback", &rcu_callback_key);

From 88748e330040ecf4681a2c8f344fd386862bf913 Mon Sep 17 00:00:00 2001
From: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Date: Mon, 4 May 2020 08:05:05 -0400
Subject: [PATCH 092/502] trace: events: rcu: Change description of rcu_dyntick
 trace event

The different strings used for describing the polarity are
Start, End and StillNonIdle. Since StillIdle is not used in any trace
point for rcu_dyntick, it can be removed, and StillNonIdle can be added
to the description because it is used in a few tracepoints for
rcu_dyntick.

Similarly, USER, IDLE and IRQ are used for describing context in
the rcu_dyntick tracepoints. Since "KERNEL" is not used for any
of the rcu_dyntick tracepoints, remove it from the description.

Signed-off-by: Madhuparna Bhowmik <madhuparnabhowmik10@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/trace/events/rcu.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index f9a7811148e2..af274d1532bf 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -435,11 +435,12 @@ TRACE_EVENT_RCU(rcu_fqs,
 #endif /* #if defined(CONFIG_TREE_RCU) */
 
 /*
- * Tracepoint for dyntick-idle entry/exit events.  These take a string
- * as argument: "Start" for entering dyntick-idle mode, "Startirq" for
- * entering it from irq/NMI, "End" for leaving it, "Endirq" for leaving it
- * to irq/NMI, "--=" for events moving towards idle, and "++=" for events
- * moving away from idle.
+ * Tracepoint for dyntick-idle entry/exit events.  These take 2 strings
+ * as argument:
+ * polarity: "Start", "End", "StillNonIdle" for entering, exiting or still not
+ *            being in dyntick-idle mode.
+ * context: "USER" or "IDLE" or "IRQ".
+ * NMIs nested in IRQs are inferred with dynticks_nesting > 1 in IRQ context.
  *
  * These events also take a pair of numbers, which indicate the nesting
  * depth before and after the event of interest, and a third number that is

From 77865dea25c4f45ce0c5bf61a8470af01fccd944 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 May 2020 15:44:46 -0700
Subject: [PATCH 093/502] rcu: Grace-period-kthread related sleeps to idle
 priority

This commit converts the long-standing schedule_timeout_interruptible()
and schedule_timeout_uninterruptible() calls used by RCU's grace-period
kthread to schedule_timeout_idle().  This conversion avoids polluting
the load-average with RCU-related sleeping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 874c831bcc45..feb31c201dee 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1638,7 +1638,7 @@ static void rcu_gp_slow(int delay)
 	if (delay > 0 &&
 	    !(rcu_seq_ctr(rcu_state.gp_seq) %
 	      (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay)))
-		schedule_timeout_uninterruptible(delay);
+		schedule_timeout_idle(delay);
 }
 
 static unsigned long sleep_duration;
@@ -1661,7 +1661,7 @@ static void rcu_gp_torture_wait(void)
 	duration = xchg(&sleep_duration, 0UL);
 	if (duration > 0) {
 		pr_alert("%s: Waiting %lu jiffies\n", __func__, duration);
-		schedule_timeout_uninterruptible(duration);
+		schedule_timeout_idle(duration);
 		pr_alert("%s: Wait complete\n", __func__);
 	}
 }
@@ -2727,7 +2727,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
 	}
 	*statusp = RCU_KTHREAD_YIELDING;
 	trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
-	schedule_timeout_interruptible(2);
+	schedule_timeout_idle(2);
 	trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
 	*statusp = RCU_KTHREAD_WAITING;
 }

From a9352f72d6a9e8fe4840b9f0d97af8f5a6c52c79 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 May 2020 16:34:38 -0700
Subject: [PATCH 094/502] rcu: Priority-boost-related sleeps to idle priority

This commit converts the long-standing schedule_timeout_interruptible()
call used by RCU's priority-boosting kthreads to schedule_timeout_idle().
This conversion avoids polluting the load-average with RCU-related
sleeping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 352223664ebd..25296c17a30d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1033,7 +1033,7 @@ static int rcu_boost_kthread(void *arg)
 		if (spincnt > 10) {
 			WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
 			trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
-			schedule_timeout_interruptible(2);
+			schedule_timeout_idle(2);
 			trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
 			spincnt = 0;
 		}

From f5ca34643bbd84f514bdeee194c45dd1fb066ef2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 May 2020 16:36:10 -0700
Subject: [PATCH 095/502] rcu: No-CBs-related sleeps to idle priority

This commit converts the schedule_timeout_interruptible() call used by
RCU's no-CBs grace-period kthreads to schedule_timeout_idle().  This
conversion avoids polluting the load-average with RCU-related sleeping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_plugin.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 25296c17a30d..982fc5be5269 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2005,7 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
 		/* Polling, so trace if first poll in the series. */
 		if (gotcbs)
 			trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
-		schedule_timeout_interruptible(1);
+		schedule_timeout_idle(1);
 	} else if (!needwait_gp) {
 		/* Wait for callbacks to appear. */
 		trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));

From 68c2f27e01f61760e6ae76fff9682e1ffe9bacb6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 May 2020 16:38:29 -0700
Subject: [PATCH 096/502] rcu: Expedited grace-period sleeps to idle priority

This commit converts the schedule_timeout_uninterruptible() call used
by RCU's expedited grace-period processing to schedule_timeout_idle().
This conversion avoids polluting the load-average with RCU-related
sleeping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_exp.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 72952edad1e4..1888c0eb1216 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -403,7 +403,7 @@ retry_ipi:
 			/* Online, so delay for a bit and try again. */
 			raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 			trace_rcu_exp_grace_period(rcu_state.name, rcu_exp_gp_seq_endval(), TPS("selectofl"));
-			schedule_timeout_uninterruptible(1);
+			schedule_timeout_idle(1);
 			goto retry_ipi;
 		}
 		/* CPU really is offline, so we must report its QS. */

From 9f47eb5461aaeb6cb8696f9d11503ae90e4d5cb0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 8 May 2020 14:15:37 -0700
Subject: [PATCH 097/502] fs/btrfs: Add cond_resched() for
 try_release_extent_mapping() stalls

Very large I/Os can cause the following RCU CPU stall warning:

RIP: 0010:rb_prev+0x8/0x50
Code: 49 89 c0 49 89 d1 48 89 c2 48 89 f8 e9 e5 fd ff ff 4c 89 48 10 c3 4c 89 06 c3 4c 89 40 10 c3 0f 1f 00 48 8b 0f 48 39 cf 74 38 <48> 8b 47 10 48 85 c0 74 22 48 8b 50 08 48 85 d2 74 0c 48 89 d0 48
RSP: 0018:ffffc9002212bab0 EFLAGS: 00000287 ORIG_RAX: ffffffffffffff13
RAX: ffff888821f93630 RBX: ffff888821f93630 RCX: ffff888821f937e0
RDX: 0000000000000000 RSI: 0000000000102000 RDI: ffff888821f93630
RBP: 0000000000103000 R08: 000000000006c000 R09: 0000000000000238
R10: 0000000000102fff R11: ffffc9002212bac8 R12: 0000000000000001
R13: ffffffffffffffff R14: 0000000000102000 R15: ffff888821f937e0
 __lookup_extent_mapping+0xa0/0x110
 try_release_extent_mapping+0xdc/0x220
 btrfs_releasepage+0x45/0x70
 shrink_page_list+0xa39/0xb30
 shrink_inactive_list+0x18f/0x3b0
 shrink_lruvec+0x38e/0x6b0
 shrink_node+0x14d/0x690
 do_try_to_free_pages+0xc6/0x3e0
 try_to_free_mem_cgroup_pages+0xe6/0x1e0
 reclaim_high.constprop.73+0x87/0xc0
 mem_cgroup_handle_over_high+0x66/0x150
 exit_to_usermode_loop+0x82/0xd0
 do_syscall_64+0xd4/0x100
 entry_SYSCALL_64_after_hwframe+0x44/0xa9

On a PREEMPT=n kernel, the try_release_extent_mapping() function's
"while" loop might run for a very long time on a large I/O.  This commit
therefore adds a cond_resched() to this loop, providing RCU any needed
quiescent states.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 fs/btrfs/extent_io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 68c96057ad2d..704239546093 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4515,6 +4515,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
 
 			/* once for us */
 			free_extent_map(em);
+
+			cond_resched(); /* Allow large-extent preemption. */
 		}
 	}
 	return try_release_extent_state(tree, page, mask);

From 360fbbb4897c98971e8955b063c01250817a2191 Mon Sep 17 00:00:00 2001
From: Lihao Liang <lihaoliang@google.com>
Date: Thu, 14 May 2020 21:34:34 +0100
Subject: [PATCH 098/502] rcu: Update comment from rsp->rcu_gp_seq to
 rsp->gp_seq

Signed-off-by: Lihao Liang <lihaoliang@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9c6f7343bec0..575745f0a464 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -41,7 +41,7 @@ struct rcu_node {
 	raw_spinlock_t __private lock;	/* Root rcu_node's lock protects */
 					/*  some rcu_state fields as well as */
 					/*  following. */
-	unsigned long gp_seq;	/* Track rsp->rcu_gp_seq. */
+	unsigned long gp_seq;	/* Track rsp->gp_seq. */
 	unsigned long gp_seq_needed; /* Track furthest future GP request. */
 	unsigned long completedqs; /* All QSes done for this node. */
 	unsigned long qsmask;	/* CPUs or groups that need to switch in */
@@ -149,7 +149,7 @@ union rcu_noqs {
 /* Per-CPU data for read-copy update. */
 struct rcu_data {
 	/* 1) quiescent-state and grace-period handling : */
-	unsigned long	gp_seq;		/* Track rsp->rcu_gp_seq counter. */
+	unsigned long	gp_seq;		/* Track rsp->gp_seq counter. */
 	unsigned long	gp_seq_needed;	/* Track furthest future GP request. */
 	union rcu_noqs	cpu_no_qs;	/* No QSes yet for this CPU. */
 	bool		core_needs_qs;	/* Core waits for quiesc state. */

From 3c8920e2dbd1a55f72dc14d656df9d0097cf5c72 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <frederic@kernel.org>
Date: Fri, 15 May 2020 02:34:29 +0200
Subject: [PATCH 099/502] tick/nohz: Narrow down noise while setting current
 task's tick dependency

Setting a tick dependency on any task, including the case where a task
sets that dependency on itself, triggers an IPI to all CPUs.  That is
of course suboptimal but it had previously not been an issue because it
was only used by POSIX CPU timers on nohz_full, which apparently never
occurs in latency-sensitive workloads in production.  (Or users of such
systems are suffering in silence on the one hand or venting their ire
on the wrong people on the other.)

But RCU now sets a task tick dependency on the current task in order
to fix stall issues that can occur during RCU callback processing.
Thus, RCU callback processing triggers frequent system-wide IPIs from
nohz_full CPUs.  This is quite counter-productive, after all, avoiding
IPIs is what nohz_full is supposed to be all about.

This commit therefore optimizes tasks' self-setting of a task tick
dependency by using tick_nohz_full_kick() to avoid the system-wide IPI.
Instead, only the execution of the one task is disturbed, which is
acceptable given that this disturbance is well down into the noise
compared to the degree to which the RCU callback processing itself
disturbs execution.

Fixes: 6a949b7af82d (rcu: Force on tick when invoking lots of callbacks)
Reported-by: Matt Fleming <matt@codeblueprint.co.uk>
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Cc: stable@kernel.org
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/time/tick-sched.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 3e2dc9b8858c..f0199a4ba1ad 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
 EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
 
 /*
- * Set a per-task tick dependency. Posix CPU timers need this in order to elapse
- * per task timers.
+ * Set a per-task tick dependency. RCU need this. Also posix CPU timers
+ * in order to elapse per task timers.
  */
 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
 {
-	/*
-	 * We could optimize this with just kicking the target running the task
-	 * if that noise matters for nohz full users.
-	 */
-	tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
+	if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
+		if (tsk == current) {
+			preempt_disable();
+			tick_nohz_full_kick();
+			preempt_enable();
+		} else {
+			/*
+			 * Some future tick_nohz_full_kick_task()
+			 * should optimize this.
+			 */
+			tick_nohz_full_kick_all();
+		}
+	}
 }
 EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
 

From 55fbe86ef303bc8ab040e579fba34a750c08200e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 19 May 2020 15:02:02 -0700
Subject: [PATCH 100/502] rcu: Remove initialized but unused rnp from
 check_slow_task()

This commit removes the variable rnp from check_slow_task(), which
is defined, assigned to, but not otherwise used.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree_stall.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 2768ce6bf657..d203f82a380a 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -237,14 +237,12 @@ struct rcu_stall_chk_rdr {
  */
 static bool check_slow_task(struct task_struct *t, void *arg)
 {
-	struct rcu_node *rnp;
 	struct rcu_stall_chk_rdr *rscrp = arg;
 
 	if (task_curr(t))
 		return false; // It is running, so decline to inspect it.
 	rscrp->nesting = t->rcu_read_lock_nesting;
 	rscrp->rs = t->rcu_read_unlock_special;
-	rnp = t->rcu_blocked_node;
 	rscrp->on_blkd_list = !list_empty(&t->rcu_node_entry);
 	return true;
 }

From 04b25a495bd68c1dad07263fb91e8b5a31c00a9e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 19 May 2020 17:00:54 -0700
Subject: [PATCH 101/502] rcu: Mark rcu_nmi_enter() call to
 rcu_cleanup_after_idle() noinstr

The objtool complains about the call to rcu_cleanup_after_idle() from
rcu_nmi_enter(), so this commit adds instrumentation_begin() before that
call and instrumentation_end() after it.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index feb31c201dee..d17e5a08bf43 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -990,8 +990,11 @@ noinstr void rcu_nmi_enter(void)
 		rcu_dynticks_eqs_exit();
 		// ... but is watching here.
 
-		if (!in_nmi())
+		if (!in_nmi()) {
+			instrumentation_begin();
 			rcu_cleanup_after_idle();
+			instrumentation_end();
+		}
 
 		instrumentation_begin();
 		// instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs()

From d29e0b26b020422cc51b5b51733cc50fcf443965 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 May 2020 08:49:29 -0700
Subject: [PATCH 102/502] lockdep: Complain only once about RCU in extended
 quiescent state

Currently, lockdep_rcu_suspicious() complains twice about RCU read-side
critical sections being invoked from within extended quiescent states,
for example:

	RCU used illegally from idle CPU!
	rcu_scheduler_active = 2, debug_locks = 1
	RCU used illegally from extended quiescent state!

This commit therefore saves a couple lines of code and one line of
console-log output by eliminating the first of these two complaints.

Link: https://lore.kernel.org/lkml/87wo4wnpzb.fsf@nanos.tec.linutronix.de
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/locking/lockdep.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 29a8de4c50b9..0a7549d159ed 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -5851,9 +5851,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
 	pr_warn("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
 	       !rcu_lockdep_current_cpu_online()
 			? "RCU used illegally from offline CPU!\n"
-			: !rcu_is_watching()
-				? "RCU used illegally from idle CPU!\n"
-				: "",
+			: "",
 	       rcu_scheduler_active, debug_locks);
 
 	/*

From e40bb921119814c6f746891af9cd37eccda616a4 Mon Sep 17 00:00:00 2001
From: Jules Irenge <jbi.octave@gmail.com>
Date: Mon, 1 Jun 2020 19:45:49 +0100
Subject: [PATCH 103/502] rcu: Replace 1 with true

Coccinelle reports a warning

WARNING: Assignment of 0/1 to bool variable

The root cause is that the variable lastphase is a bool, but is
initialised with integer 1.  This commit therefore replaces the 1 with
a true.

Signed-off-by: Jules Irenge <jbi.octave@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/update.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index ca17b771ad60..a0ba8858dd35 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -207,7 +207,7 @@ void rcu_end_inkernel_boot(void)
 	rcu_unexpedite_gp();
 	if (rcu_normal_after_boot)
 		WRITE_ONCE(rcu_normal, 1);
-	rcu_boot_ended = 1;
+	rcu_boot_ended = true;
 }
 
 /*

From c6dfd72b7a3b70a2054db0f73245ea2f762a8452 Mon Sep 17 00:00:00 2001
From: Peter Enderborg <peter.enderborg@sony.com>
Date: Thu, 4 Jun 2020 12:23:20 +0200
Subject: [PATCH 104/502] rcu: Stop shrinker loop

The count and scan can be separated in time, and there is a fair chance
that all work is already done when the scan starts, which might in turn
result in a needless retry.  This commit therefore avoids this retry by
returning SHRINK_STOP.

Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Peter Enderborg <peter.enderborg@sony.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d17e5a08bf43..c8196fab563c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3332,7 +3332,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 			break;
 	}
 
-	return freed;
+	return freed == 0 ? SHRINK_STOP : freed;
 }
 
 static struct shrinker kfree_rcu_shrinker = {

From 00943a609d7ad0f08e58bc9c214f38b0ba163c88 Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@linux.alibaba.com>
Date: Fri, 12 Jun 2020 10:07:52 +0800
Subject: [PATCH 105/502] rcu: gp_max is protected by root rcu_node's lock

Because gp_max is protected by root rcu_node's lock, this commit moves
the gp_max definition to the region of the rcu_node structure containing
fields protected by this lock.

Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 575745f0a464..09ec93b16f28 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -302,6 +302,8 @@ struct rcu_state {
 	u8	boost ____cacheline_internodealigned_in_smp;
 						/* Subject to priority boost. */
 	unsigned long gp_seq;			/* Grace-period sequence #. */
+	unsigned long gp_max;			/* Maximum GP duration in */
+						/*  jiffies. */
 	struct task_struct *gp_kthread;		/* Task for grace periods. */
 	struct swait_queue_head gp_wq;		/* Where GP task waits. */
 	short gp_flags;				/* Commands for GP task. */
@@ -347,8 +349,6 @@ struct rcu_state {
 						/*  a reluctant CPU. */
 	unsigned long n_force_qs_gpstart;	/* Snapshot of n_force_qs at */
 						/*  GP start. */
-	unsigned long gp_max;			/* Maximum GP duration in */
-						/*  jiffies. */
 	const char *name;			/* Name of structure. */
 	char abbr;				/* Abbreviated name. */
 

From a2dae43088d51c4869e7fa91ca09bcc890e277fc Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@linux.alibaba.com>
Date: Fri, 12 Jun 2020 10:07:53 +0800
Subject: [PATCH 106/502] rcu: grplo/grphi just records CPU number

The ->grplo and ->grphi fields store the lowest and highest CPU number
covered by an rcu_node structure, which is not the group number.
This commit therefore adjusts these fields' comments to match reality.

Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 09ec93b16f28..9f903f5c9fa1 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -73,8 +73,8 @@ struct rcu_node {
 	unsigned long ffmask;	/* Fully functional CPUs. */
 	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
 				/*  Only one bit will be set in this mask. */
-	int	grplo;		/* lowest-numbered CPU or group here. */
-	int	grphi;		/* highest-numbered CPU or group here. */
+	int	grplo;		/* lowest-numbered CPU here. */
+	int	grphi;		/* highest-numbered CPU here. */
 	u8	grpnum;		/* CPU/group number for next level up. */
 	u8	level;		/* root is at level 0. */
 	bool	wait_blkd_tasks;/* Necessary to wait for blocked tasks to */

From 7a0c2b0940c13a06573320ab7118375b35feef8b Mon Sep 17 00:00:00 2001
From: Wei Yang <richard.weiyang@linux.alibaba.com>
Date: Fri, 12 Jun 2020 10:07:54 +0800
Subject: [PATCH 107/502] rcu: grpnum just records group number

The ->grpnum field in the rcu_node structure contains the bit position
in this structure's parent's bitmasks, which is not the CPU number.
This commit therefore adjusts this field's comment accordingly.

Signed-off-by: Wei Yang <richard.weiyang@linux.alibaba.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9f903f5c9fa1..c96ae351688b 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -75,7 +75,7 @@ struct rcu_node {
 				/*  Only one bit will be set in this mask. */
 	int	grplo;		/* lowest-numbered CPU here. */
 	int	grphi;		/* highest-numbered CPU here. */
-	u8	grpnum;		/* CPU/group number for next level up. */
+	u8	grpnum;		/* group number for next level up. */
 	u8	level;		/* root is at level 0. */
 	bool	wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
 				/*  exit RCU read-side critical sections */

From c3cb47a6cc74af0b79579ba167d7124eb669fbaa Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Mon, 15 Jun 2020 12:28:05 -0700
Subject: [PATCH 108/502] kernel/rcu/tree.c: Fix kernel-doc warnings

Fix kernel-doc warning:

../kernel/rcu/tree.c:959: warning: Excess function parameter 'irq' description in 'rcu_nmi_enter'

Fixes: cf7614e13c8f ("rcu: Refactor rcu_{nmi,irq}_{enter,exit}()")
Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Byungchul Park <byungchul.park@lge.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c8196fab563c..ef05aac7f9d3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -954,7 +954,6 @@ void __rcu_irq_enter_check_tick(void)
 
 /**
  * rcu_nmi_enter - inform RCU of entry to NMI context
- * @irq: Is this call from rcu_irq_enter?
  *
  * If the CPU was idle from RCU's viewpoint, update rdp->dynticks and
  * rdp->dynticks_nmi_nesting to let the RCU grace-period handling know

From 24692fa22c30cb8fcfcabdc07a3c82964475b639 Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 15 Jun 2020 08:46:49 +0200
Subject: [PATCH 109/502] rcu: Fix some kernel-doc warnings

The current code provokes some kernel-doc warnings:

	./kernel/rcu/tree.c:2915: warning: Function parameter or member 'count' not described in 'kfree_rcu_cpu'
	./include/linux/rculist.h:517: warning: bad line:                           [@right ][node2 ... ]
	./include/linux/rculist.h:2: WARNING: Unexpected indentation.

This commit therefore moves the comment for "count" to the kernel-doc
markup and adds a missing "*" on one kernel-doc continuation line.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rculist.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index df587d181844..7eed65b5f713 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -512,7 +512,7 @@ static inline void hlist_replace_rcu(struct hlist_node *old,
  * @right: The hlist head on the right
  *
  * The lists start out as [@left  ][node1 ... ] and
-                          [@right ][node2 ... ]
+ *                        [@right ][node2 ... ]
  * The lists end up as    [@left  ][node2 ... ]
  *                        [@right ][node1 ... ]
  */

From 8e11690d2f5a9823d66f68918c3986b4e9e160ab Mon Sep 17 00:00:00 2001
From: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Date: Mon, 4 May 2020 14:35:00 +0200
Subject: [PATCH 110/502] rcu: Fix a kernel-doc warnings for "count"

There are some kernel-doc warnings:

	./kernel/rcu/tree.c:2915: warning: Function parameter or member 'count' not described in 'kfree_rcu_cpu'

This commit therefore moves the comment for "count" to the kernel-doc
markup.

Signed-off-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6c6569e0586c..ba4c477495b5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3004,6 +3004,7 @@ struct kfree_rcu_cpu_work {
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
  * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
  * @initialized: The @lock and @rcu_work fields have been initialized
+ * @count: Number of objects for which GP not started
  *
  * This is a per-CPU structure.  The reason that it is not included in
  * the rcu_data structure is to permit this code to be extracted from
@@ -3019,7 +3020,6 @@ struct kfree_rcu_cpu {
 	struct delayed_work monitor_work;
 	bool monitor_todo;
 	bool initialized;
-	// Number of objects for which GP not started
 	int count;
 };
 

From 8ac88f7177c75bf9b7b8c29a8054115e1c712baf Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 May 2020 23:47:45 +0200
Subject: [PATCH 111/502] rcu/tree: Keep kfree_rcu() awake during lock
 contention

On PREEMPT_RT kernels, the krcp spinlock gets converted to an rt-mutex
and causes kfree_rcu() callers to sleep. This makes it unusable for
callers in purely atomic sections such as non-threaded IRQ handlers and
raw spinlock sections. Fix it by converting the spinlock to a raw
spinlock.

Vetting all code paths, there is no reason to believe that the raw
spinlock will hurt RT latencies as it is not held for a long time.

Cc: bigeasy@linutronix.de
Cc: Uladzislau Rezki <urezki@gmail.com>
Reviewed-by: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ba4c477495b5..c5de5adca0dd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3016,7 +3016,7 @@ struct kfree_rcu_cpu {
 	struct kfree_rcu_bulk_data *bhead;
 	struct kfree_rcu_bulk_data *bcached;
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	struct delayed_work monitor_work;
 	bool monitor_todo;
 	bool initialized;
@@ -3049,12 +3049,12 @@ static void kfree_rcu_work(struct work_struct *work)
 	krwp = container_of(to_rcu_work(work),
 			    struct kfree_rcu_cpu_work, rcu_work);
 	krcp = krwp->krcp;
-	spin_lock_irqsave(&krcp->lock, flags);
+	raw_spin_lock_irqsave(&krcp->lock, flags);
 	head = krwp->head_free;
 	krwp->head_free = NULL;
 	bhead = krwp->bhead_free;
 	krwp->bhead_free = NULL;
-	spin_unlock_irqrestore(&krcp->lock, flags);
+	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
 	/* "bhead" is now private, so traverse locklessly. */
 	for (; bhead; bhead = bnext) {
@@ -3157,14 +3157,14 @@ static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,
 	krcp->monitor_todo = false;
 	if (queue_kfree_rcu_work(krcp)) {
 		// Success! Our job is done here.
-		spin_unlock_irqrestore(&krcp->lock, flags);
+		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 		return;
 	}
 
 	// Previous RCU batch still in progress, try again later.
 	krcp->monitor_todo = true;
 	schedule_delayed_work(&krcp->monitor_work, KFREE_DRAIN_JIFFIES);
-	spin_unlock_irqrestore(&krcp->lock, flags);
+	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
 /*
@@ -3177,11 +3177,11 @@ static void kfree_rcu_monitor(struct work_struct *work)
 	struct kfree_rcu_cpu *krcp = container_of(work, struct kfree_rcu_cpu,
 						 monitor_work.work);
 
-	spin_lock_irqsave(&krcp->lock, flags);
+	raw_spin_lock_irqsave(&krcp->lock, flags);
 	if (krcp->monitor_todo)
 		kfree_rcu_drain_unlock(krcp, flags);
 	else
-		spin_unlock_irqrestore(&krcp->lock, flags);
+		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 }
 
 static inline bool
@@ -3252,7 +3252,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	local_irq_save(flags);	// For safely calling this_cpu_ptr().
 	krcp = this_cpu_ptr(&krc);
 	if (krcp->initialized)
-		spin_lock(&krcp->lock);
+		raw_spin_lock(&krcp->lock);
 
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(head)) {
@@ -3283,7 +3283,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 
 unlock_return:
 	if (krcp->initialized)
-		spin_unlock(&krcp->lock);
+		raw_spin_unlock(&krcp->lock);
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(kfree_call_rcu);
@@ -3315,11 +3315,11 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
 		count = krcp->count;
-		spin_lock_irqsave(&krcp->lock, flags);
+		raw_spin_lock_irqsave(&krcp->lock, flags);
 		if (krcp->monitor_todo)
 			kfree_rcu_drain_unlock(krcp, flags);
 		else
-			spin_unlock_irqrestore(&krcp->lock, flags);
+			raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
 		sc->nr_to_scan -= count;
 		freed += count;
@@ -3346,15 +3346,15 @@ void __init kfree_rcu_scheduler_running(void)
 	for_each_online_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
-		spin_lock_irqsave(&krcp->lock, flags);
+		raw_spin_lock_irqsave(&krcp->lock, flags);
 		if (!krcp->head || krcp->monitor_todo) {
-			spin_unlock_irqrestore(&krcp->lock, flags);
+			raw_spin_unlock_irqrestore(&krcp->lock, flags);
 			continue;
 		}
 		krcp->monitor_todo = true;
 		schedule_delayed_work_on(cpu, &krcp->monitor_work,
 					 KFREE_DRAIN_JIFFIES);
-		spin_unlock_irqrestore(&krcp->lock, flags);
+		raw_spin_unlock_irqrestore(&krcp->lock, flags);
 	}
 }
 
@@ -4250,7 +4250,7 @@ static void __init kfree_rcu_batch_init(void)
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
-		spin_lock_init(&krcp->lock);
+		raw_spin_lock_init(&krcp->lock);
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;

From 4d2919411867848fab78c7cb13139e17ad8b85bc Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 May 2020 23:47:46 +0200
Subject: [PATCH 112/502] rcu/tree: Skip entry into the page allocator for
 PREEMPT_RT

To keep the kfree_rcu() code working in purely atomic sections on RT,
such as non-threaded IRQ handlers and raw spinlock sections, avoid
calling into the page allocator which uses sleeping locks on RT.

In fact, even if the caller is preemptible, the kfree_rcu() code is
not, as the krcp->lock is a raw spinlock.

Calling into the page allocator is optional and avoiding it should be
OK, especially with the page pre-allocation support in future patches.
Such pre-allocation would further avoid the need for a dynamically
allocated page in the first place.

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Uladzislau Rezki <urezki@gmail.com>
Co-developed-by: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index c5de5adca0dd..e0425faf3b3b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3202,6 +3202,18 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 		if (!bnode) {
 			WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
 
+			/*
+			 * To keep this path working on raw non-preemptible
+			 * sections, prevent the optional entry into the
+			 * allocator as it uses sleeping locks. In fact, even
+			 * if the caller of kfree_rcu() is preemptible, this
+			 * path still is not, as krcp->lock is a raw spinlock.
+			 * With additional page pre-allocation in the works,
+			 * hitting this return is going to be much less likely.
+			 */
+			if (IS_ENABLED(CONFIG_PREEMPT_RT))
+				return false;
+
 			bnode = (struct kfree_rcu_bulk_data *)
 				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		}

From 594aa5975b9b5cfe9edaec06170e43b8c0607377 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:47 +0200
Subject: [PATCH 113/502] rcu/tree: Repeat the monitor if any free channel is
 busy

It is possible that one of the channels cannot be detached
because its free channel is busy and previously queued data
has not been processed yet. On the other hand, another
channel can be successfully detached causing the monitor
work to stop.

Prevent that by rescheduling the monitor work if there are
any channels in the pending state after a detach attempt.

Fixes: 34c881745549e ("rcu: Support kfree_bulk() interface in kfree_rcu()")
Acked-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index e0425faf3b3b..5151fe4e1429 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3105,7 +3105,7 @@ static void kfree_rcu_work(struct work_struct *work)
 static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
 	struct kfree_rcu_cpu_work *krwp;
-	bool queued = false;
+	bool repeat = false;
 	int i;
 
 	lockdep_assert_held(&krcp->lock);
@@ -3143,11 +3143,14 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 			 * been detached following each other, one by one.
 			 */
 			queue_rcu_work(system_wq, &krwp->rcu_work);
-			queued = true;
 		}
+
+		/* Repeat if any "free" corresponding channel is still busy. */
+		if (krcp->bhead || krcp->head)
+			repeat = true;
 	}
 
-	return queued;
+	return !repeat;
 }
 
 static inline void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krcp,

From 446044eb9c9c335d3ae1be4665193ab43ebb284e Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 May 2020 23:47:48 +0200
Subject: [PATCH 114/502] rcu/tree: Make debug_objects logic independent of
 rcu_head

kfree_rcu()'s debug_objects logic uses the address of the object's
embedded rcu_head to queue/unqueue. Instead of this, make use of the
object's address itself as preparation for future headless kfree_rcu()
support.

Reviewed-by: Uladzislau Rezki <urezki@gmail.com>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5151fe4e1429..143c1e9265b6 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2970,13 +2970,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
  * @nr_records: Number of active pointers in the array
  * @records: Array of the kfree_rcu() pointers
  * @next: Next bulk object in the block chain
- * @head_free_debug: For debug, when CONFIG_DEBUG_OBJECTS_RCU_HEAD is set
  */
 struct kfree_rcu_bulk_data {
 	unsigned long nr_records;
 	void *records[KFREE_BULK_MAX_ENTR];
 	struct kfree_rcu_bulk_data *next;
-	struct rcu_head *head_free_debug;
 };
 
 /**
@@ -3026,11 +3024,13 @@ struct kfree_rcu_cpu {
 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
 
 static __always_inline void
-debug_rcu_head_unqueue_bulk(struct rcu_head *head)
+debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead)
 {
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-	for (; head; head = head->next)
-		debug_rcu_head_unqueue(head);
+	int i;
+
+	for (i = 0; i < bhead->nr_records; i++)
+		debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
 #endif
 }
 
@@ -3060,7 +3060,7 @@ static void kfree_rcu_work(struct work_struct *work)
 	for (; bhead; bhead = bnext) {
 		bnext = bhead->next;
 
-		debug_rcu_head_unqueue_bulk(bhead->head_free_debug);
+		debug_rcu_bhead_unqueue(bhead);
 
 		rcu_lock_acquire(&rcu_callback_map);
 		trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
@@ -3082,14 +3082,15 @@ static void kfree_rcu_work(struct work_struct *work)
 	 */
 	for (; head; head = next) {
 		unsigned long offset = (unsigned long)head->func;
+		void *ptr = (void *)head - offset;
 
 		next = head->next;
-		debug_rcu_head_unqueue(head);
+		debug_rcu_head_unqueue((struct rcu_head *)ptr);
 		rcu_lock_acquire(&rcu_callback_map);
 		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
 
 		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
-			kfree((void *)head - offset);
+			kfree(ptr);
 
 		rcu_lock_release(&rcu_callback_map);
 		cond_resched_tasks_rcu_qs();
@@ -3228,18 +3229,11 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 		/* Initialize the new block. */
 		bnode->nr_records = 0;
 		bnode->next = krcp->bhead;
-		bnode->head_free_debug = NULL;
 
 		/* Attach it to the head. */
 		krcp->bhead = bnode;
 	}
 
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
-	head->func = func;
-	head->next = krcp->bhead->head_free_debug;
-	krcp->bhead->head_free_debug = head;
-#endif
-
 	/* Finally insert. */
 	krcp->bhead->records[krcp->bhead->nr_records++] =
 		(void *) head - (unsigned long) func;
@@ -3263,14 +3257,17 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	unsigned long flags;
 	struct kfree_rcu_cpu *krcp;
+	void *ptr;
 
 	local_irq_save(flags);	// For safely calling this_cpu_ptr().
 	krcp = this_cpu_ptr(&krc);
 	if (krcp->initialized)
 		raw_spin_lock(&krcp->lock);
 
+	ptr = (void *)head - (unsigned long)func;
+
 	// Queue the object but don't yet schedule the batch.
-	if (debug_rcu_head_queue(head)) {
+	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
 		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
 			  __func__, head);

From 3af84862817403d317dc33312e7a88d76e79401a Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:49 +0200
Subject: [PATCH 115/502] rcu/tree: Simplify KFREE_BULK_MAX_ENTR macro

We can simplify KFREE_BULK_MAX_ENTR macro and get rid of
magic numbers which were used to make the structure to be
exactly one page.

Suggested-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 143c1e9265b6..bcdc06364426 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2958,13 +2958,6 @@ EXPORT_SYMBOL_GPL(call_rcu);
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
 #define KFREE_N_BATCHES 2
 
-/*
- * This macro defines how many entries the "records" array
- * will contain. It is based on the fact that the size of
- * kfree_rcu_bulk_data structure becomes exactly one page.
- */
-#define KFREE_BULK_MAX_ENTR ((PAGE_SIZE / sizeof(void *)) - 3)
-
 /**
  * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
  * @nr_records: Number of active pointers in the array
@@ -2973,10 +2966,18 @@ EXPORT_SYMBOL_GPL(call_rcu);
  */
 struct kfree_rcu_bulk_data {
 	unsigned long nr_records;
-	void *records[KFREE_BULK_MAX_ENTR];
 	struct kfree_rcu_bulk_data *next;
+	void *records[];
 };
 
+/*
+ * This macro defines how many entries the "records" array
+ * will contain. It is based on the fact that the size of
+ * kfree_rcu_bulk_data structure becomes exactly one page.
+ */
+#define KFREE_BULK_MAX_ENTR \
+	((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *))
+
 /**
  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period

From 952371d6fc0bc360d1d5780f86bb355836117ca2 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:50 +0200
Subject: [PATCH 116/502] rcu/tree: Move kfree_rcu_cpu locking/unlocking to
 separate functions

Introduce helpers to lock and unlock per-cpu "kfree_rcu_cpu"
structures. That will make kfree_call_rcu() more readable
and prevent programming errors.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 31 +++++++++++++++++++++++--------
 1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index bcdc06364426..368bdc441ffb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3035,6 +3035,27 @@ debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead)
 #endif
 }
 
+static inline struct kfree_rcu_cpu *
+krc_this_cpu_lock(unsigned long *flags)
+{
+	struct kfree_rcu_cpu *krcp;
+
+	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
+	krcp = this_cpu_ptr(&krc);
+	if (likely(krcp->initialized))
+		raw_spin_lock(&krcp->lock);
+
+	return krcp;
+}
+
+static inline void
+krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
+{
+	if (likely(krcp->initialized))
+		raw_spin_unlock(&krcp->lock);
+	local_irq_restore(flags);
+}
+
 /*
  * This function is invoked in workqueue context after a grace period.
  * It frees all the objects queued on ->bhead_free or ->head_free.
@@ -3260,11 +3281,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	struct kfree_rcu_cpu *krcp;
 	void *ptr;
 
-	local_irq_save(flags);	// For safely calling this_cpu_ptr().
-	krcp = this_cpu_ptr(&krc);
-	if (krcp->initialized)
-		raw_spin_lock(&krcp->lock);
-
+	krcp = krc_this_cpu_lock(&flags);
 	ptr = (void *)head - (unsigned long)func;
 
 	// Queue the object but don't yet schedule the batch.
@@ -3295,9 +3312,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	}
 
 unlock_return:
-	if (krcp->initialized)
-		raw_spin_unlock(&krcp->lock);
-	local_irq_restore(flags);
+	krc_this_cpu_unlock(krcp, flags);
 }
 EXPORT_SYMBOL_GPL(kfree_call_rcu);
 

From 69f08d3999dbef1553a3332b8055282dd3893b6c Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Mon, 25 May 2020 23:47:51 +0200
Subject: [PATCH 117/502] rcu/tree: Use static initializer for krc.lock

The per-CPU variable is initialized at runtime in
kfree_rcu_batch_init(). This function is invoked before
'rcu_scheduler_active' is set to 'RCU_SCHEDULER_RUNNING'.
After the initialisation, '->initialized' is set to true.

The raw_spin_lock is only acquired if '->initialized' is
set to true. The workqueue item is only used if 'rcu_scheduler_active'
is set to RCU_SCHEDULER_RUNNING, which happens after initialisation.

Use a static initializer for krc.lock and remove the runtime
initialisation of the lock. Since the lock can now be always
acquired, remove the '->initialized' check.

Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 368bdc441ffb..a42a4693f161 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3002,7 +3002,7 @@ struct kfree_rcu_cpu_work {
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
  * @monitor_todo: Tracks whether a @monitor_work delayed work is pending
- * @initialized: The @lock and @rcu_work fields have been initialized
+ * @initialized: The @rcu_work fields have been initialized
  * @count: Number of objects for which GP not started
  *
  * This is a per-CPU structure.  The reason that it is not included in
@@ -3022,7 +3022,9 @@ struct kfree_rcu_cpu {
 	int count;
 };
 
-static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
+	.lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
+};
 
 static __always_inline void
 debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead)
@@ -3042,8 +3044,7 @@ krc_this_cpu_lock(unsigned long *flags)
 
 	local_irq_save(*flags);	// For safely calling this_cpu_ptr().
 	krcp = this_cpu_ptr(&krc);
-	if (likely(krcp->initialized))
-		raw_spin_lock(&krcp->lock);
+	raw_spin_lock(&krcp->lock);
 
 	return krcp;
 }
@@ -3051,8 +3052,7 @@ krc_this_cpu_lock(unsigned long *flags)
 static inline void
 krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
 {
-	if (likely(krcp->initialized))
-		raw_spin_unlock(&krcp->lock);
+	raw_spin_unlock(&krcp->lock);
 	local_irq_restore(flags);
 }
 
@@ -4278,7 +4278,6 @@ static void __init kfree_rcu_batch_init(void)
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
 
-		raw_spin_lock_init(&krcp->lock);
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;

From 53c72b590b3a0afd6747d6f7957e6838003e90a4 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:52 +0200
Subject: [PATCH 118/502] rcu/tree: cache specified number of objects

In order to reduce the dynamic need for pages in kfree_rcu(),
pre-allocate a configurable number of pages per CPU and link
them in a list. When kfree_rcu() reclaims objects, the object's
container page is cached into a list instead of being released
to the low-level page allocator.

Such an approach provides O(1) access to free pages while also
reducing the number of requests to the page allocator. It also
allows the kfree_rcu() code to have free pages available during
a low-memory condition.

A read-only sysfs parameter (rcu_min_cached_objs) reflects the
minimum number of allowed cached pages per CPU.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         |  8 +++
 kernel/rcu/tree.c                             | 66 +++++++++++++++++--
 2 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..befaa63652ff 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4038,6 +4038,14 @@
 			latencies, which will choose a value aligned
 			with the appropriate hardware boundaries.
 
+	rcutree.rcu_min_cached_objs= [KNL]
+			Minimum number of objects which are cached and
+			maintained per one CPU. Object size is equal
+			to PAGE_SIZE. The cache allows to reduce the
+			pressure to page allocator, also it makes the
+			whole algorithm to behave better in low memory
+			condition.
+
 	rcutree.jiffies_till_first_fqs= [KNL]
 			Set delay from grace-period initialization to
 			first attempt to force quiescent states.
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a42a4693f161..37c0cd0332f8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -175,6 +175,15 @@ module_param(gp_init_delay, int, 0444);
 static int gp_cleanup_delay;
 module_param(gp_cleanup_delay, int, 0444);
 
+/*
+ * This rcu parameter is runtime-read-only. It reflects
+ * a minimum allowed number of objects which can be cached
+ * per-CPU. Object size is equal to one page. This value
+ * can be changed at boot time.
+ */
+static int rcu_min_cached_objs = 2;
+module_param(rcu_min_cached_objs, int, 0444);
+
 /* Retrieve RCU kthreads priority for rcutorture */
 int rcu_get_gp_kthreads_prio(void)
 {
@@ -2997,7 +3006,6 @@ struct kfree_rcu_cpu_work {
  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
  * @head: List of kfree_rcu() objects not yet waiting for a grace period
  * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
- * @bcached: Keeps at most one object for later reuse when build chain blocks
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
@@ -3013,13 +3021,22 @@ struct kfree_rcu_cpu_work {
 struct kfree_rcu_cpu {
 	struct rcu_head *head;
 	struct kfree_rcu_bulk_data *bhead;
-	struct kfree_rcu_bulk_data *bcached;
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
 	raw_spinlock_t lock;
 	struct delayed_work monitor_work;
 	bool monitor_todo;
 	bool initialized;
 	int count;
+
+	/*
+	 * A simple cache list that contains objects for
+	 * reuse purpose. In order to save some per-cpu
+	 * space the list is singular. Even though it is
+	 * lockless an access has to be protected by the
+	 * per-cpu lock.
+	 */
+	struct llist_head bkvcache;
+	int nr_bkv_objs;
 };
 
 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
@@ -3056,6 +3073,31 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
 	local_irq_restore(flags);
 }
 
+static inline struct kfree_rcu_bulk_data *
+get_cached_bnode(struct kfree_rcu_cpu *krcp)
+{
+	if (!krcp->nr_bkv_objs)
+		return NULL;
+
+	krcp->nr_bkv_objs--;
+	return (struct kfree_rcu_bulk_data *)
+		llist_del_first(&krcp->bkvcache);
+}
+
+static inline bool
+put_cached_bnode(struct kfree_rcu_cpu *krcp,
+	struct kfree_rcu_bulk_data *bnode)
+{
+	// Check the limit.
+	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
+		return false;
+
+	llist_add((struct llist_node *) bnode, &krcp->bkvcache);
+	krcp->nr_bkv_objs++;
+	return true;
+
+}
+
 /*
  * This function is invoked in workqueue context after a grace period.
  * It frees all the objects queued on ->bhead_free or ->head_free.
@@ -3091,7 +3133,12 @@ static void kfree_rcu_work(struct work_struct *work)
 		kfree_bulk(bhead->nr_records, bhead->records);
 		rcu_lock_release(&rcu_callback_map);
 
-		if (cmpxchg(&krcp->bcached, NULL, bhead))
+		krcp = krc_this_cpu_lock(&flags);
+		if (put_cached_bnode(krcp, bhead))
+			bhead = NULL;
+		krc_this_cpu_unlock(krcp, flags);
+
+		if (bhead)
 			free_page((unsigned long) bhead);
 
 		cond_resched_tasks_rcu_qs();
@@ -3224,7 +3271,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 	/* Check if a new block is required. */
 	if (!krcp->bhead ||
 			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
-		bnode = xchg(&krcp->bcached, NULL);
+		bnode = get_cached_bnode(krcp);
 		if (!bnode) {
 			WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
 
@@ -4277,12 +4324,23 @@ static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
+		struct kfree_rcu_bulk_data *bnode;
 
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
 		}
 
+		for (i = 0; i < rcu_min_cached_objs; i++) {
+			bnode = (struct kfree_rcu_bulk_data *)
+				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+
+			if (bnode)
+				put_cached_bnode(krcp, bnode);
+			else
+				pr_err("Failed to preallocate for %d CPU!\n", cpu);
+		}
+
 		INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
 		krcp->initialized = true;
 	}

From 5f3c8d620447d509e534962e23f7edfb85f4e533 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:53 +0200
Subject: [PATCH 119/502] rcu/tree: Maintain separate array for vmalloc ptrs

To do so, we use an array of kvfree_rcu_bulk_data structures.
It consists of two elements:
 - index number 0 corresponds to slab pointers.
 - index number 1 corresponds to vmalloc pointers.

Keeping vmalloc pointers separated from slab pointers makes
it possible to invoke the right freeing API for the right
kind of pointer.

It also prepares us for future headless support for vmalloc
and SLAB objects. Such objects cannot be queued on a linked
list and are instead placed directly into an array.

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tree.c | 167 +++++++++++++++++++++++++++-------------------
 1 file changed, 97 insertions(+), 70 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 37c0cd0332f8..67c4b984c499 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,8 @@
 #include <linux/slab.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/clock.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include "../time/tick-internal.h"
 
 #include "tree.h"
@@ -2966,46 +2968,47 @@ EXPORT_SYMBOL_GPL(call_rcu);
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
 #define KFREE_N_BATCHES 2
+#define FREE_N_CHANNELS 2
 
 /**
- * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
  * @nr_records: Number of active pointers in the array
- * @records: Array of the kfree_rcu() pointers
  * @next: Next bulk object in the block chain
+ * @records: Array of the kvfree_rcu() pointers
  */
-struct kfree_rcu_bulk_data {
+struct kvfree_rcu_bulk_data {
 	unsigned long nr_records;
-	struct kfree_rcu_bulk_data *next;
+	struct kvfree_rcu_bulk_data *next;
 	void *records[];
 };
 
 /*
  * This macro defines how many entries the "records" array
  * will contain. It is based on the fact that the size of
- * kfree_rcu_bulk_data structure becomes exactly one page.
+ * kvfree_rcu_bulk_data structure becomes exactly one page.
  */
-#define KFREE_BULK_MAX_ENTR \
-	((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *))
+#define KVFREE_BULK_MAX_ENTR \
+	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
 
 /**
  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
  * @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
+ * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
  * @krcp: Pointer to @kfree_rcu_cpu structure
  */
 
 struct kfree_rcu_cpu_work {
 	struct rcu_work rcu_work;
 	struct rcu_head *head_free;
-	struct kfree_rcu_bulk_data *bhead_free;
+	struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu *krcp;
 };
 
 /**
  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
  * @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
+ * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
@@ -3020,7 +3023,7 @@ struct kfree_rcu_cpu_work {
  */
 struct kfree_rcu_cpu {
 	struct rcu_head *head;
-	struct kfree_rcu_bulk_data *bhead;
+	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
 	raw_spinlock_t lock;
 	struct delayed_work monitor_work;
@@ -3044,7 +3047,7 @@ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
 };
 
 static __always_inline void
-debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead)
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
 {
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
 	int i;
@@ -3073,20 +3076,20 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
 	local_irq_restore(flags);
 }
 
-static inline struct kfree_rcu_bulk_data *
+static inline struct kvfree_rcu_bulk_data *
 get_cached_bnode(struct kfree_rcu_cpu *krcp)
 {
 	if (!krcp->nr_bkv_objs)
 		return NULL;
 
 	krcp->nr_bkv_objs--;
-	return (struct kfree_rcu_bulk_data *)
+	return (struct kvfree_rcu_bulk_data *)
 		llist_del_first(&krcp->bkvcache);
 }
 
 static inline bool
 put_cached_bnode(struct kfree_rcu_cpu *krcp,
-	struct kfree_rcu_bulk_data *bnode)
+	struct kvfree_rcu_bulk_data *bnode)
 {
 	// Check the limit.
 	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
@@ -3105,43 +3108,63 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp,
 static void kfree_rcu_work(struct work_struct *work)
 {
 	unsigned long flags;
+	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
 	struct rcu_head *head, *next;
-	struct kfree_rcu_bulk_data *bhead, *bnext;
 	struct kfree_rcu_cpu *krcp;
 	struct kfree_rcu_cpu_work *krwp;
+	int i, j;
 
 	krwp = container_of(to_rcu_work(work),
 			    struct kfree_rcu_cpu_work, rcu_work);
 	krcp = krwp->krcp;
+
 	raw_spin_lock_irqsave(&krcp->lock, flags);
+	// Channels 1 and 2.
+	for (i = 0; i < FREE_N_CHANNELS; i++) {
+		bkvhead[i] = krwp->bkvhead_free[i];
+		krwp->bkvhead_free[i] = NULL;
+	}
+
+	// Channel 3.
 	head = krwp->head_free;
 	krwp->head_free = NULL;
-	bhead = krwp->bhead_free;
-	krwp->bhead_free = NULL;
 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
-	/* "bhead" is now private, so traverse locklessly. */
-	for (; bhead; bhead = bnext) {
-		bnext = bhead->next;
+	// Handle two first channels.
+	for (i = 0; i < FREE_N_CHANNELS; i++) {
+		for (; bkvhead[i]; bkvhead[i] = bnext) {
+			bnext = bkvhead[i]->next;
+			debug_rcu_bhead_unqueue(bkvhead[i]);
 
-		debug_rcu_bhead_unqueue(bhead);
+			rcu_lock_acquire(&rcu_callback_map);
+			if (i == 0) { // kmalloc() / kfree().
+				trace_rcu_invoke_kfree_bulk_callback(
+					rcu_state.name, bkvhead[i]->nr_records,
+					bkvhead[i]->records);
 
-		rcu_lock_acquire(&rcu_callback_map);
-		trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
-			bhead->nr_records, bhead->records);
+				kfree_bulk(bkvhead[i]->nr_records,
+					bkvhead[i]->records);
+			} else { // vmalloc() / vfree().
+				for (j = 0; j < bkvhead[i]->nr_records; j++) {
+					trace_rcu_invoke_kfree_callback(
+						rcu_state.name,
+						bkvhead[i]->records[j], 0);
 
-		kfree_bulk(bhead->nr_records, bhead->records);
-		rcu_lock_release(&rcu_callback_map);
+					vfree(bkvhead[i]->records[j]);
+				}
+			}
+			rcu_lock_release(&rcu_callback_map);
 
-		krcp = krc_this_cpu_lock(&flags);
-		if (put_cached_bnode(krcp, bhead))
-			bhead = NULL;
-		krc_this_cpu_unlock(krcp, flags);
+			krcp = krc_this_cpu_lock(&flags);
+			if (put_cached_bnode(krcp, bkvhead[i]))
+				bkvhead[i] = NULL;
+			krc_this_cpu_unlock(krcp, flags);
 
-		if (bhead)
-			free_page((unsigned long) bhead);
+			if (bkvhead[i])
+				free_page((unsigned long) bkvhead[i]);
 
-		cond_resched_tasks_rcu_qs();
+			cond_resched_tasks_rcu_qs();
+		}
 	}
 
 	/*
@@ -3159,7 +3182,7 @@ static void kfree_rcu_work(struct work_struct *work)
 		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
 
 		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
-			kfree(ptr);
+			kvfree(ptr);
 
 		rcu_lock_release(&rcu_callback_map);
 		cond_resched_tasks_rcu_qs();
@@ -3176,7 +3199,7 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
 	struct kfree_rcu_cpu_work *krwp;
 	bool repeat = false;
-	int i;
+	int i, j;
 
 	lockdep_assert_held(&krcp->lock);
 
@@ -3184,21 +3207,25 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 		krwp = &(krcp->krw_arr[i]);
 
 		/*
-		 * Try to detach bhead or head and attach it over any
+		 * Try to detach bkvhead or head and attach it over any
 		 * available corresponding free channel. It can be that
 		 * a previous RCU batch is in progress, it means that
 		 * immediately to queue another one is not possible so
 		 * return false to tell caller to retry.
 		 */
-		if ((krcp->bhead && !krwp->bhead_free) ||
+		if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
+			(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
 				(krcp->head && !krwp->head_free)) {
-			/* Channel 1. */
-			if (!krwp->bhead_free) {
-				krwp->bhead_free = krcp->bhead;
-				krcp->bhead = NULL;
+			// Channel 1 corresponds to SLAB ptrs.
+			// Channel 2 corresponds to vmalloc ptrs.
+			for (j = 0; j < FREE_N_CHANNELS; j++) {
+				if (!krwp->bkvhead_free[j]) {
+					krwp->bkvhead_free[j] = krcp->bkvhead[j];
+					krcp->bkvhead[j] = NULL;
+				}
 			}
 
-			/* Channel 2. */
+			// Channel 3 corresponds to emergency path.
 			if (!krwp->head_free) {
 				krwp->head_free = krcp->head;
 				krcp->head = NULL;
@@ -3207,16 +3234,17 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 			WRITE_ONCE(krcp->count, 0);
 
 			/*
-			 * One work is per one batch, so there are two "free channels",
-			 * "bhead_free" and "head_free" the batch can handle. It can be
-			 * that the work is in the pending state when two channels have
-			 * been detached following each other, one by one.
+			 * One work is per one batch, so there are three
+			 * "free channels", the batch can handle. It can
+			 * be that the work is in the pending state when
+			 * channels have been detached following by each
+			 * other.
 			 */
 			queue_rcu_work(system_wq, &krwp->rcu_work);
 		}
 
-		/* Repeat if any "free" corresponding channel is still busy. */
-		if (krcp->bhead || krcp->head)
+		// Repeat if any "free" corresponding channel is still busy.
+		if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
 			repeat = true;
 	}
 
@@ -3258,23 +3286,22 @@ static void kfree_rcu_monitor(struct work_struct *work)
 }
 
 static inline bool
-kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
-	struct rcu_head *head, rcu_callback_t func)
+kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 {
-	struct kfree_rcu_bulk_data *bnode;
+	struct kvfree_rcu_bulk_data *bnode;
+	int idx;
 
 	if (unlikely(!krcp->initialized))
 		return false;
 
 	lockdep_assert_held(&krcp->lock);
+	idx = !!is_vmalloc_addr(ptr);
 
 	/* Check if a new block is required. */
-	if (!krcp->bhead ||
-			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
+	if (!krcp->bkvhead[idx] ||
+			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
 		bnode = get_cached_bnode(krcp);
 		if (!bnode) {
-			WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
-
 			/*
 			 * To keep this path working on raw non-preemptible
 			 * sections, prevent the optional entry into the
@@ -3287,7 +3314,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 			if (IS_ENABLED(CONFIG_PREEMPT_RT))
 				return false;
 
-			bnode = (struct kfree_rcu_bulk_data *)
+			bnode = (struct kvfree_rcu_bulk_data *)
 				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		}
 
@@ -3297,30 +3324,30 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 
 		/* Initialize the new block. */
 		bnode->nr_records = 0;
-		bnode->next = krcp->bhead;
+		bnode->next = krcp->bkvhead[idx];
 
 		/* Attach it to the head. */
-		krcp->bhead = bnode;
+		krcp->bkvhead[idx] = bnode;
 	}
 
 	/* Finally insert. */
-	krcp->bhead->records[krcp->bhead->nr_records++] =
-		(void *) head - (unsigned long) func;
+	krcp->bkvhead[idx]->records
+		[krcp->bkvhead[idx]->nr_records++] = ptr;
 
 	return true;
 }
 
 /*
- * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
- * period. Please note there are two paths are maintained, one is the main one
- * that uses kfree_bulk() interface and second one is emergency one, that is
- * used only when the main path can not be maintained temporary, due to memory
- * pressure.
+ * Queue a request for lazy invocation of appropriate free routine after a
+ * grace period. Please note there are three paths are maintained, two are the
+ * main ones that use array of pointers interface and third one is emergency
+ * one, that is used only when the main path can not be maintained temporary,
+ * due to memory pressure.
  *
  * Each kfree_call_rcu() request is added to a batch. The batch will be drained
  * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
  * be free'd in workqueue context. This allows us to: batch requests together to
- * reduce the number of grace periods during heavy kfree_rcu() load.
+ * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
  */
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
@@ -3343,7 +3370,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	 * Under high memory pressure GFP_NOWAIT can fail,
 	 * in that case the emergency path is maintained.
 	 */
-	if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
+	if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) {
 		head->func = func;
 		head->next = krcp->head;
 		krcp->head = head;
@@ -4324,7 +4351,7 @@ static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-		struct kfree_rcu_bulk_data *bnode;
+		struct kvfree_rcu_bulk_data *bnode;
 
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
@@ -4332,7 +4359,7 @@ static void __init kfree_rcu_batch_init(void)
 		}
 
 		for (i = 0; i < rcu_min_cached_objs; i++) {
-			bnode = (struct kfree_rcu_bulk_data *)
+			bnode = (struct kvfree_rcu_bulk_data *)
 				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 
 			if (bnode)

From 64d1d06ccb1b7de245ccf781b91517f328bebd9f Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:54 +0200
Subject: [PATCH 120/502] rcu/tiny: support vmalloc in tiny-RCU

Replace kfree() with kvfree() in rcu_reclaim_tiny().
This makes it possible to release either SLAB or vmalloc
objects after a GP.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tiny.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index dd572ce7c747..4b99f7b88bee 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -23,6 +23,7 @@
 #include <linux/cpu.h>
 #include <linux/prefetch.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 
 #include "rcu.h"
 
@@ -86,7 +87,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
 	rcu_lock_acquire(&rcu_callback_map);
 	if (__is_kfree_rcu_offset(offset)) {
 		trace_rcu_invoke_kfree_callback("", head, offset);
-		kfree((void *)head - offset);
+		kvfree((void *)head - offset);
 		rcu_lock_release(&rcu_callback_map);
 		return true;
 	}

From c408b215f58f7156bb6bafb64c0263ee907033df Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:55 +0200
Subject: [PATCH 121/502] rcu: Rename
 *_kfree_callback/*_kfree_rcu_offset/kfree_call_*

The following changes are introduced:

1. Rename rcu_invoke_kfree_callback() to rcu_invoke_kvfree_callback(),
as well as the associated trace events, so that rcu_kfree_callback()
becomes rcu_kvfree_callback(). The reason is to align the naming with
the kvfree() notation.

2. Rename __is_kfree_rcu_offset to __is_kvfree_rcu_offset. All RCU
paths use kvfree() now instead of kfree(), thus rename it.

3. Rename kfree_call_rcu() to kvfree_call_rcu(). The reason is that
it is now capable of freeing vmalloc() memory. Do the same with the
__kfree_rcu() macro, which becomes __kvfree_rcu(), for the same
reason.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h   | 14 +++++++-------
 include/linux/rcutiny.h    |  2 +-
 include/linux/rcutree.h    |  2 +-
 include/trace/events/rcu.h |  8 ++++----
 kernel/rcu/tiny.c          |  4 ++--
 kernel/rcu/tree.c          | 16 ++++++++--------
 6 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 659cbfa7581a..b344fc800a9b 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -828,17 +828,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 
 /*
  * Does the specified offset indicate that the corresponding rcu_head
- * structure can be handled by kfree_rcu()?
+ * structure can be handled by kvfree_rcu()?
  */
-#define __is_kfree_rcu_offset(offset) ((offset) < 4096)
+#define __is_kvfree_rcu_offset(offset) ((offset) < 4096)
 
 /*
  * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain.
  */
-#define __kfree_rcu(head, offset) \
+#define __kvfree_rcu(head, offset) \
 	do { \
-		BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); \
-		kfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
+		BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \
+		kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \
 	} while (0)
 
 /**
@@ -857,7 +857,7 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
  * Because the functions are not allowed in the low-order 4096 bytes of
  * kernel virtual memory, offsets up to 4095 bytes can be accommodated.
  * If the offset is larger than 4095 bytes, a compile-time error will
- * be generated in __kfree_rcu().  If this error is triggered, you can
+ * be generated in __kvfree_rcu(). If this error is triggered, you can
  * either fall back to use of call_rcu() or rearrange the structure to
  * position the rcu_head structure into the first 4096 bytes.
  *
@@ -872,7 +872,7 @@ do {									\
 	typeof (ptr) ___p = (ptr);					\
 									\
 	if (___p)							\
-		__kfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
+		__kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
 } while (0)
 
 /*
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 8512caeb7682..fb2eb39c484f 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -34,7 +34,7 @@ static inline void synchronize_rcu_expedited(void)
 	synchronize_rcu();
 }
 
-static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	call_rcu(head, func);
 }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d5cc9d675987..d2f4064ebd1d 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -33,7 +33,7 @@ static inline void rcu_virt_note_context_switch(int cpu)
 }
 
 void synchronize_rcu_expedited(void);
-void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
 
 void rcu_barrier(void);
 bool rcu_eqs_special_set(int cpu);
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index f9a7811148e2..0ee93d0b1daa 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -506,13 +506,13 @@ TRACE_EVENT_RCU(rcu_callback,
 
 /*
  * Tracepoint for the registration of a single RCU callback of the special
- * kfree() form.  The first argument is the RCU type, the second argument
+ * kvfree() form.  The first argument is the RCU type, the second argument
  * is a pointer to the RCU callback, the third argument is the offset
  * of the callback within the enclosing RCU-protected data structure,
  * the fourth argument is the number of lazy callbacks queued, and the
  * fifth argument is the total number of callbacks queued.
  */
-TRACE_EVENT_RCU(rcu_kfree_callback,
+TRACE_EVENT_RCU(rcu_kvfree_callback,
 
 	TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset,
 		 long qlen),
@@ -596,12 +596,12 @@ TRACE_EVENT_RCU(rcu_invoke_callback,
 
 /*
  * Tracepoint for the invocation of a single RCU callback of the special
- * kfree() form.  The first argument is the RCU flavor, the second
+ * kvfree() form.  The first argument is the RCU flavor, the second
  * argument is a pointer to the RCU callback, and the third argument
  * is the offset of the callback within the enclosing RCU-protected
  * data structure.
  */
-TRACE_EVENT_RCU(rcu_invoke_kfree_callback,
+TRACE_EVENT_RCU(rcu_invoke_kvfree_callback,
 
 	TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset),
 
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 4b99f7b88bee..aa897c3f2e92 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -85,8 +85,8 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head)
 	unsigned long offset = (unsigned long)head->func;
 
 	rcu_lock_acquire(&rcu_callback_map);
-	if (__is_kfree_rcu_offset(offset)) {
-		trace_rcu_invoke_kfree_callback("", head, offset);
+	if (__is_kvfree_rcu_offset(offset)) {
+		trace_rcu_invoke_kvfree_callback("", head, offset);
 		kvfree((void *)head - offset);
 		rcu_lock_release(&rcu_callback_map);
 		return true;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 67c4b984c499..f22c47e72287 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2905,8 +2905,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func)
 		return; // Enqueued onto ->nocb_bypass, so just leave.
 	// If no-CBs CPU gets here, rcu_nocb_try_bypass() acquired ->nocb_lock.
 	rcu_segcblist_enqueue(&rdp->cblist, head);
-	if (__is_kfree_rcu_offset((unsigned long)func))
-		trace_rcu_kfree_callback(rcu_state.name, head,
+	if (__is_kvfree_rcu_offset((unsigned long)func))
+		trace_rcu_kvfree_callback(rcu_state.name, head,
 					 (unsigned long)func,
 					 rcu_segcblist_n_cbs(&rdp->cblist));
 	else
@@ -3146,7 +3146,7 @@ static void kfree_rcu_work(struct work_struct *work)
 					bkvhead[i]->records);
 			} else { // vmalloc() / vfree().
 				for (j = 0; j < bkvhead[i]->nr_records; j++) {
-					trace_rcu_invoke_kfree_callback(
+					trace_rcu_invoke_kvfree_callback(
 						rcu_state.name,
 						bkvhead[i]->records[j], 0);
 
@@ -3179,9 +3179,9 @@ static void kfree_rcu_work(struct work_struct *work)
 		next = head->next;
 		debug_rcu_head_unqueue((struct rcu_head *)ptr);
 		rcu_lock_acquire(&rcu_callback_map);
-		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
+		trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
 
-		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
+		if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
 			kvfree(ptr);
 
 		rcu_lock_release(&rcu_callback_map);
@@ -3344,12 +3344,12 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
  * one, that is used only when the main path can not be maintained temporary,
  * due to memory pressure.
  *
- * Each kfree_call_rcu() request is added to a batch. The batch will be drained
+ * Each kvfree_call_rcu() request is added to a batch. The batch will be drained
  * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
  * be free'd in workqueue context. This allows us to: batch requests together to
  * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
  */
-void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	unsigned long flags;
 	struct kfree_rcu_cpu *krcp;
@@ -3388,7 +3388,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 unlock_return:
 	krc_this_cpu_unlock(krcp, flags);
 }
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
+EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 
 static unsigned long
 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)

From e0feed08ab41df0fedc38d35938891ef5715c1d3 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:56 +0200
Subject: [PATCH 122/502] mm/list_lru.c: Rename kvfree_rcu() to local variant

Rename the kvfree_rcu() function to kvfree_rcu_local(). The
purpose is to prevent a conflict between two declarations of
the same name: kvfree_rcu() is about to become globally visible,
which would otherwise lead to a build error. No functional change.

Cc: linux-mm@kvack.org
Cc: rcu@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 mm/list_lru.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/list_lru.c b/mm/list_lru.c
index 9222910ab1cb..e825804b3928 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -373,14 +373,14 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
 	struct list_lru_memcg *memcg_lrus;
 	/*
 	 * This is called when shrinker has already been unregistered,
-	 * and nobody can use it. So, there is no need to use kvfree_rcu().
+	 * and nobody can use it. So, there is no need to use kvfree_rcu_local().
 	 */
 	memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
 	__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
 	kvfree(memcg_lrus);
 }
 
-static void kvfree_rcu(struct rcu_head *head)
+static void kvfree_rcu_local(struct rcu_head *head)
 {
 	struct list_lru_memcg *mlru;
 
@@ -419,7 +419,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 	rcu_assign_pointer(nlru->memcg_lrus, new);
 	spin_unlock_irq(&nlru->lock);
 
-	call_rcu(&old->rcu, kvfree_rcu);
+	call_rcu(&old->rcu, kvfree_rcu_local);
 	return 0;
 }
 

From ce4dce123fdcb5f209752d13f9f06926be65fc78 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:57 +0200
Subject: [PATCH 123/502] rcu: Introduce 2 arg kvfree_rcu() interface

kvmalloc() can allocate two types of objects: SLAB backed
and vmalloc backed. How it behaves depends on requested
object's size and memory pressure.

Add a kvfree_rcu() interface that can free memory allocated
via kvmalloc(). It is a simple alias to kfree_rcu() which
can now handle either type of object.

<snip>
    struct test_kvfree_rcu {
        struct rcu_head rcu;
        unsigned char array[100];
    };

    struct test_kvfree_rcu *p;

    p = kvmalloc(10 * PAGE_SIZE, GFP_KERNEL);
    if (p)
        kvfree_rcu(p, rcu);
<snip>

Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index b344fc800a9b..51b26ab02878 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -875,6 +875,15 @@ do {									\
 		__kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \
 } while (0)
 
+/**
+ * kvfree_rcu() - kvfree an object after a grace period.
+ * @ptr:	pointer to kvfree
+ * @rhf:	the name of the struct rcu_head within the type of @ptr.
+ *
+ * Same as kfree_rcu(), just simple alias.
+ */
+#define kvfree_rcu(ptr, rhf) kfree_rcu(ptr, rhf)
+
 /*
  * Place this after a lock-acquisition primitive to guarantee that
  * an UNLOCK+LOCK pair acts as a full barrier.  This guarantee applies

From 3042f83f19bec2e0cd356f72b39e4d816e8cd5ff Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:58 +0200
Subject: [PATCH 124/502] rcu: Support reclaim for head-less object

Update the kvfree_call_rcu() function with head-less support.
This allows RCU to reclaim objects without an embedded rcu_head.

tree-RCU:
We introduce two chains of arrays: one stores SLAB-backed pointers
and the other stores vmalloc pointers.  Storage in either of these
arrays does not require embedding an rcu_head within the object.

Maintaining the arrays may become impossible due to high memory
pressure. For such cases there is an emergency path. Objects with
rcu_head inside are just queued on a backup rcu_head list. Later on
that list is drained. As for the head-less variant, as the current
context can sleep, the following emergency measures are applied:
   a) Synchronously wait until a grace period has elapsed.
   b) Call kvfree().

tiny-RCU:
For double-argument calls, there are no changes in behavior. For
single-argument calls, kvfree() is directly inlined on the current
stack after a synchronize_rcu() call. Note that for tiny-RCU, any
call to synchronize_rcu() is actually a quiescent state, therefore
it does nothing.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcutiny.h | 18 ++++++++++++++++-
 kernel/rcu/tree.c       | 45 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index fb2eb39c484f..5cc9637cac16 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -34,9 +34,25 @@ static inline void synchronize_rcu_expedited(void)
 	synchronize_rcu();
 }
 
+/*
+ * Add one more declaration of kvfree() here. It is
+ * not so straight forward to just include <linux/mm.h>
+ * where it is defined due to getting many compile
+ * errors caused by that include.
+ */
+extern void kvfree(const void *addr);
+
 static inline void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
-	call_rcu(head, func);
+	if (head) {
+		call_rcu(head, func);
+		return;
+	}
+
+	// kvfree_rcu(one_arg) call.
+	might_sleep();
+	synchronize_rcu();
+	kvfree((void *) func);
 }
 
 void rcu_qs(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f22c47e72287..01f29e4500ba 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3314,6 +3314,13 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 			if (IS_ENABLED(CONFIG_PREEMPT_RT))
 				return false;
 
+			/*
+			 * NOTE: For one argument of kvfree_rcu() we can
+			 * drop the lock and get the page in sleepable
+			 * context. That would allow to maintain an array
+			 * for the CONFIG_PREEMPT_RT as well if no cached
+			 * pages are available.
+			 */
 			bnode = (struct kvfree_rcu_bulk_data *)
 				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		}
@@ -3353,16 +3360,33 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
 	unsigned long flags;
 	struct kfree_rcu_cpu *krcp;
+	bool success;
 	void *ptr;
 
+	if (head) {
+		ptr = (void *) head - (unsigned long) func;
+	} else {
+		/*
+		 * Please note there is a limitation for the head-less
+		 * variant, that is why there is a clear rule for such
+		 * objects: it can be used from might_sleep() context
+		 * only. For other places please embed an rcu_head to
+		 * your data.
+		 */
+		might_sleep();
+		ptr = (unsigned long *) func;
+	}
+
 	krcp = krc_this_cpu_lock(&flags);
-	ptr = (void *)head - (unsigned long)func;
 
 	// Queue the object but don't yet schedule the batch.
 	if (debug_rcu_head_queue(ptr)) {
 		// Probable double kfree_rcu(), just leak.
 		WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
 			  __func__, head);
+
+		// Mark as success and leave.
+		success = true;
 		goto unlock_return;
 	}
 
@@ -3370,10 +3394,16 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	 * Under high memory pressure GFP_NOWAIT can fail,
 	 * in that case the emergency path is maintained.
 	 */
-	if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) {
+	success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr);
+	if (!success) {
+		if (head == NULL)
+			// Inline if kvfree_rcu(one_arg) call.
+			goto unlock_return;
+
 		head->func = func;
 		head->next = krcp->head;
 		krcp->head = head;
+		success = true;
 	}
 
 	WRITE_ONCE(krcp->count, krcp->count + 1);
@@ -3387,6 +3417,17 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 
 unlock_return:
 	krc_this_cpu_unlock(krcp, flags);
+
+	/*
+	 * Inline kvfree() after synchronize_rcu(). We can do
+	 * it from might_sleep() context only, so the current
+	 * CPU can pass the QS state.
+	 */
+	if (!success) {
+		debug_rcu_head_unqueue((struct rcu_head *) ptr);
+		synchronize_rcu();
+		kvfree(ptr);
+	}
 }
 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 

From 1835f475e3518ade61e25a57572c78b953778656 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:47:59 +0200
Subject: [PATCH 125/502] rcu: Introduce single argument kvfree_rcu() interface

Make kvfree_rcu() capable of freeing objects that will not
embed an rcu_head within it. This saves storage overhead in
such objects. Reclaiming headless objects this way requires
only a single argument (pointer to the object).

After this patch, there are two ways to use kvfree_rcu():

a) kvfree_rcu(ptr, rhf);
    struct X {
        struct rcu_head rhf;
        unsigned char data[100];
    };

    void *ptr = kvmalloc(sizeof(struct X), GFP_KERNEL);
    if (ptr)
        kvfree_rcu(ptr, rhf);

b) kvfree_rcu(ptr);
    void *ptr = kvmalloc(some_bytes, GFP_KERNEL);
    if (ptr)
        kvfree_rcu(ptr);

Note that the headless usage (example b) can only be used in code
that can sleep. This is enforced by the CONFIG_DEBUG_ATOMIC_SLEEP
option.

Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate.h | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 51b26ab02878..d15d46db61f7 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -877,12 +877,42 @@ do {									\
 
 /**
  * kvfree_rcu() - kvfree an object after a grace period.
- * @ptr:	pointer to kvfree
- * @rhf:	the name of the struct rcu_head within the type of @ptr.
  *
- * Same as kfree_rcu(), just simple alias.
+ * This macro consists of one or two arguments and it is
+ * based on whether an object is head-less or not. If it
+ * has a head then a semantic stays the same as it used
+ * to be before:
+ *
+ *     kvfree_rcu(ptr, rhf);
+ *
+ * where @ptr is a pointer to kvfree(), @rhf is the name
+ * of the rcu_head structure within the type of @ptr.
+ *
+ * When it comes to head-less variant, only one argument
+ * is passed and that is just a pointer which has to be
+ * freed after a grace period. Therefore the semantic is
+ *
+ *     kvfree_rcu(ptr);
+ *
+ * where @ptr is a pointer to kvfree().
+ *
+ * Please note, head-less way of freeing is permitted to
+ * use from a context that has to follow might_sleep()
+ * annotation. Otherwise, please switch and embed the
+ * rcu_head structure within the type of @ptr.
  */
-#define kvfree_rcu(ptr, rhf) kfree_rcu(ptr, rhf)
+#define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__,		\
+	kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__)
+
+#define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME
+#define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf)
+#define kvfree_rcu_arg_1(ptr)					\
+do {								\
+	typeof(ptr) ___p = (ptr);				\
+								\
+	if (___p)						\
+		kvfree_call_rcu(NULL, (rcu_callback_t) (___p));	\
+} while (0)
 
 /*
  * Place this after a lock-acquisition primitive to guarantee that

From da4fc00abb97ce1269b0940abe86e25456e28424 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)" <urezki@gmail.com>
Date: Mon, 25 May 2020 23:48:00 +0200
Subject: [PATCH 126/502] lib/test_vmalloc.c: Add test cases for kvfree_rcu()

Introduce four new test cases for testing the kvfree_rcu()
interface. Two of them cover the single-argument functionality
and the other two cover the 2-argument functionality.

The aim is to stress and check how kvfree_rcu() behaves under
different load and memory conditions and analyze its performance
throughput.

Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 lib/test_vmalloc.c | 103 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 95 insertions(+), 8 deletions(-)

diff --git a/lib/test_vmalloc.c b/lib/test_vmalloc.c
index ddc9685702b1..5cf2fe9aab9e 100644
--- a/lib/test_vmalloc.c
+++ b/lib/test_vmalloc.c
@@ -15,6 +15,8 @@
 #include <linux/delay.h>
 #include <linux/rwsem.h>
 #include <linux/mm.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
 
 #define __param(type, name, init, msg)		\
 	static type name = init;				\
@@ -35,14 +37,18 @@ __param(int, test_loop_count, 1000000,
 
 __param(int, run_test_mask, INT_MAX,
 	"Set tests specified in the mask.\n\n"
-		"\t\tid: 1,   name: fix_size_alloc_test\n"
-		"\t\tid: 2,   name: full_fit_alloc_test\n"
-		"\t\tid: 4,   name: long_busy_list_alloc_test\n"
-		"\t\tid: 8,   name: random_size_alloc_test\n"
-		"\t\tid: 16,  name: fix_align_alloc_test\n"
-		"\t\tid: 32,  name: random_size_align_alloc_test\n"
-		"\t\tid: 64,  name: align_shift_alloc_test\n"
-		"\t\tid: 128, name: pcpu_alloc_test\n"
+		"\t\tid: 1,    name: fix_size_alloc_test\n"
+		"\t\tid: 2,    name: full_fit_alloc_test\n"
+		"\t\tid: 4,    name: long_busy_list_alloc_test\n"
+		"\t\tid: 8,    name: random_size_alloc_test\n"
+		"\t\tid: 16,   name: fix_align_alloc_test\n"
+		"\t\tid: 32,   name: random_size_align_alloc_test\n"
+		"\t\tid: 64,   name: align_shift_alloc_test\n"
+		"\t\tid: 128,  name: pcpu_alloc_test\n"
+		"\t\tid: 256,  name: kvfree_rcu_1_arg_vmalloc_test\n"
+		"\t\tid: 512,  name: kvfree_rcu_2_arg_vmalloc_test\n"
+		"\t\tid: 1024, name: kvfree_rcu_1_arg_slab_test\n"
+		"\t\tid: 2048, name: kvfree_rcu_2_arg_slab_test\n"
 		/* Add a new test case description here. */
 );
 
@@ -316,6 +322,83 @@ pcpu_alloc_test(void)
 	return rv;
 }
 
+struct test_kvfree_rcu {
+	struct rcu_head rcu;
+	unsigned char array[20];
+};
+
+static int
+kvfree_rcu_1_arg_vmalloc_test(void)
+{
+	struct test_kvfree_rcu *p;
+	int i;
+
+	for (i = 0; i < test_loop_count; i++) {
+		p = vmalloc(1 * PAGE_SIZE);
+		if (!p)
+			return -1;
+
+		p->array[0] = 'a';
+		kvfree_rcu(p);
+	}
+
+	return 0;
+}
+
+static int
+kvfree_rcu_2_arg_vmalloc_test(void)
+{
+	struct test_kvfree_rcu *p;
+	int i;
+
+	for (i = 0; i < test_loop_count; i++) {
+		p = vmalloc(1 * PAGE_SIZE);
+		if (!p)
+			return -1;
+
+		p->array[0] = 'a';
+		kvfree_rcu(p, rcu);
+	}
+
+	return 0;
+}
+
+static int
+kvfree_rcu_1_arg_slab_test(void)
+{
+	struct test_kvfree_rcu *p;
+	int i;
+
+	for (i = 0; i < test_loop_count; i++) {
+		p = kmalloc(sizeof(*p), GFP_KERNEL);
+		if (!p)
+			return -1;
+
+		p->array[0] = 'a';
+		kvfree_rcu(p);
+	}
+
+	return 0;
+}
+
+static int
+kvfree_rcu_2_arg_slab_test(void)
+{
+	struct test_kvfree_rcu *p;
+	int i;
+
+	for (i = 0; i < test_loop_count; i++) {
+		p = kmalloc(sizeof(*p), GFP_KERNEL);
+		if (!p)
+			return -1;
+
+		p->array[0] = 'a';
+		kvfree_rcu(p, rcu);
+	}
+
+	return 0;
+}
+
 struct test_case_desc {
 	const char *test_name;
 	int (*test_func)(void);
@@ -330,6 +413,10 @@ static struct test_case_desc test_case_array[] = {
 	{ "random_size_align_alloc_test", random_size_align_alloc_test },
 	{ "align_shift_alloc_test", align_shift_alloc_test },
 	{ "pcpu_alloc_test", pcpu_alloc_test },
+	{ "kvfree_rcu_1_arg_vmalloc_test", kvfree_rcu_1_arg_vmalloc_test },
+	{ "kvfree_rcu_2_arg_vmalloc_test", kvfree_rcu_2_arg_vmalloc_test },
+	{ "kvfree_rcu_1_arg_slab_test", kvfree_rcu_1_arg_slab_test },
+	{ "kvfree_rcu_2_arg_slab_test", kvfree_rcu_2_arg_slab_test },
 	/* Add a new test case here. */
 };
 

From ea6eed9f7d7382c7230202d4c3bf74185f193394 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 7 May 2020 16:47:13 -0700
Subject: [PATCH 127/502] rcu-tasks: Convert sleeps to idle priority

This commit converts the long-standing schedule_timeout_interruptible()
and schedule_timeout_uninterruptible() calls used by the various Tasks
RCU's grace-period kthreads to schedule_timeout_idle().  This conversion
avoids polluting the load-average with Tasks-RCU-related sleeping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index ce23f6cc5043..91fee8122acd 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -205,7 +205,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 			if (!rtp->cbs_head) {
 				WARN_ON(signal_pending(current));
 				set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS);
-				schedule_timeout_interruptible(HZ/10);
+				schedule_timeout_idle(HZ/10);
 			}
 			continue;
 		}
@@ -227,7 +227,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
 			cond_resched();
 		}
 		/* Paranoid sleep to keep this from entering a tight loop */
-		schedule_timeout_uninterruptible(HZ/10);
+		schedule_timeout_idle(HZ/10);
 
 		set_tasks_gp_state(rtp, RTGS_WAIT_CBS);
 	}
@@ -336,7 +336,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
 
 		/* Slowly back off waiting for holdouts */
 		set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS);
-		schedule_timeout_interruptible(HZ/fract);
+		schedule_timeout_idle(HZ/fract);
 
 		if (fract > 1)
 			fract--;

From 04a3c5aa7a8cb2ce97f9beb627ba742bc8b0fe03 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 May 2020 19:27:06 -0700
Subject: [PATCH 128/502] rcu-tasks: Make rcu_tasks_postscan() be static

The rcu_tasks_postscan() function is not used outside of RCU's tasks.h
file, so this commit makes it be static.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 91fee8122acd..da200e53d60d 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -402,7 +402,7 @@ static void rcu_tasks_pertask(struct task_struct *t, struct list_head *hop)
 }
 
 /* Processing between scanning taskslist and draining the holdout list. */
-void rcu_tasks_postscan(struct list_head *hop)
+static void rcu_tasks_postscan(struct list_head *hop)
 {
 	/*
 	 * Wait for tasks that are in the process of exiting.  This

From 5b3cc99bedf5885055fbaf35fe63d205f06b5be5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 May 2020 19:33:47 -0700
Subject: [PATCH 129/502] rcu-tasks: Add #include of rcupdate_trace.h to
 update.c

Although this is in some strict sense unnecessary, it is good to allow
the compiler to compare the function declaration with its definition.
This commit therefore adds a #include of linux/rcupdate_trace.h to
kernel/rcu/update.c.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/update.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 84843adfd939..c0fea809d738 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -42,6 +42,7 @@
 #include <linux/kprobes.h>
 #include <linux/slab.h>
 #include <linux/irq_work.h>
+#include <linux/rcupdate_trace.h>
 
 #define CREATE_TRACE_POINTS
 

From 8344496e8b49c4122c1808d6cd3f8dc71bccb595 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 May 2020 20:03:48 -0700
Subject: [PATCH 130/502] rcu-tasks: Conditionally compile
 show_rcu_tasks_gp_kthreads()

The show_rcu_tasks_gp_kthreads() function is not invoked by Tiny RCU,
but is nevertheless defined in Tiny RCU builds that enable Tasks Trace
RCU.  This commit therefore conditionally compiles this function so
that it is defined only in builds that actually use it.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index da200e53d60d..d5c003c1972c 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -103,6 +103,7 @@ module_param(rcu_task_stall_timeout, int, 0644);
 #define RTGS_WAIT_READERS	 9
 #define RTGS_INVOKE_CBS		10
 #define RTGS_WAIT_CBS		11
+#ifndef CONFIG_TINY_RCU
 static const char * const rcu_tasks_gp_state_names[] = {
 	"RTGS_INIT",
 	"RTGS_WAIT_WAIT_CBS",
@@ -117,6 +118,7 @@ static const char * const rcu_tasks_gp_state_names[] = {
 	"RTGS_INVOKE_CBS",
 	"RTGS_WAIT_CBS",
 };
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 ////////////////////////////////////////////////////////////////////////
 //
@@ -129,6 +131,7 @@ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate)
 	rtp->gp_jiffies = jiffies;
 }
 
+#ifndef CONFIG_TINY_RCU
 /* Return state name. */
 static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
 {
@@ -139,6 +142,7 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp)
 		return "???";
 	return rcu_tasks_gp_state_names[j];
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 // Enqueue a callback for the specified flavor of Tasks RCU.
 static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func,
@@ -268,6 +272,7 @@ static void __init rcu_tasks_bootup_oddness(void)
 
 #endif /* #ifndef CONFIG_TINY_RCU */
 
+#ifndef CONFIG_TINY_RCU
 /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */
 static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
 {
@@ -281,6 +286,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s)
 		".C"[!!data_race(rtp->cbs_head)],
 		s);
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 static void exit_tasks_rcu_finish_trace(struct task_struct *t);
 
@@ -557,10 +563,12 @@ static int __init rcu_spawn_tasks_kthread(void)
 }
 core_initcall(rcu_spawn_tasks_kthread);
 
+#ifndef CONFIG_TINY_RCU
 static void show_rcu_tasks_classic_gp_kthread(void)
 {
 	show_rcu_tasks_generic_gp_kthread(&rcu_tasks, "");
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 /* Do the srcu_read_lock() for the above synchronize_srcu().  */
 void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu)
@@ -682,10 +690,12 @@ static int __init rcu_spawn_tasks_rude_kthread(void)
 }
 core_initcall(rcu_spawn_tasks_rude_kthread);
 
+#ifndef CONFIG_TINY_RCU
 static void show_rcu_tasks_rude_gp_kthread(void)
 {
 	show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, "");
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 #else /* #ifdef CONFIG_TASKS_RUDE_RCU */
 static void show_rcu_tasks_rude_gp_kthread(void) {}
@@ -1164,6 +1174,7 @@ static int __init rcu_spawn_tasks_trace_kthread(void)
 }
 core_initcall(rcu_spawn_tasks_trace_kthread);
 
+#ifndef CONFIG_TINY_RCU
 static void show_rcu_tasks_trace_gp_kthread(void)
 {
 	char buf[64];
@@ -1174,18 +1185,21 @@ static void show_rcu_tasks_trace_gp_kthread(void)
 		data_race(n_heavy_reader_attempts));
 	show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf);
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 #else /* #ifdef CONFIG_TASKS_TRACE_RCU */
 static void exit_tasks_rcu_finish_trace(struct task_struct *t) { }
 static inline void show_rcu_tasks_trace_gp_kthread(void) {}
 #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */
 
+#ifndef CONFIG_TINY_RCU
 void show_rcu_tasks_gp_kthreads(void)
 {
 	show_rcu_tasks_classic_gp_kthread();
 	show_rcu_tasks_rude_gp_kthread();
 	show_rcu_tasks_trace_gp_kthread();
 }
+#endif /* #ifndef CONFIG_TINY_RCU */
 
 #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */
 static inline void rcu_tasks_bootup_oddness(void) {}

From 30d8aa5128f12c9d781b67c9694c1abfa4f6ce6a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 9 Jun 2020 09:24:51 -0700
Subject: [PATCH 131/502] rcu-tasks: Fix code-style issues

This commit declares trc_n_readers_need_end and trc_wait static and
replaced a "&" with "&&".  The "&" happened to work because the values
are bool, but accidents waiting to happen and all that...

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/tasks.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index d5c003c1972c..828f222895f1 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -737,8 +737,8 @@ EXPORT_SYMBOL_GPL(rcu_trace_lock_map);
 
 #ifdef CONFIG_TASKS_TRACE_RCU
 
-atomic_t trc_n_readers_need_end;	// Number of waited-for readers.
-DECLARE_WAIT_QUEUE_HEAD(trc_wait);	// List of holdout tasks.
+static atomic_t trc_n_readers_need_end;		// Number of waited-for readers.
+static DECLARE_WAIT_QUEUE_HEAD(trc_wait);	// List of holdout tasks.
 
 // Record outstanding IPIs to each CPU.  No point in sending two...
 static DEFINE_PER_CPU(bool, trc_ipi_to_cpu);
@@ -845,7 +845,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
 	bool ofl = cpu_is_offline(cpu);
 
 	if (task_curr(t)) {
-		WARN_ON_ONCE(ofl & !is_idle_task(t));
+		WARN_ON_ONCE(ofl && !is_idle_task(t));
 
 		// If no chance of heavyweight readers, do it the hard way.
 		if (!ofl && !IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))

From 7e866460cc18797b3a59360f5f8c444598a21729 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 May 2020 00:36:47 -0400
Subject: [PATCH 132/502] rcuperf: Remove useless while loops around wait_event

wait_event() already retries if the condition for the wake up is not
satisfied after wake up. Remove the redundant loops from the rcuperf test.

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcuperf.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 16dd1e6b7c09..246da8fe199e 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -576,11 +576,8 @@ static int compute_real(int n)
 static int
 rcu_perf_shutdown(void *arg)
 {
-	do {
-		wait_event(shutdown_wq,
-			   atomic_read(&n_rcu_perf_writer_finished) >=
-			   nrealwriters);
-	} while (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters);
+	wait_event(shutdown_wq,
+		   atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters);
 	smp_mb(); /* Wake before output. */
 	rcu_perf_cleanup();
 	kernel_power_off();
@@ -693,11 +690,8 @@ kfree_perf_cleanup(void)
 static int
 kfree_perf_shutdown(void *arg)
 {
-	do {
-		wait_event(shutdown_wq,
-			   atomic_read(&n_kfree_perf_thread_ended) >=
-			   kfree_nrealthreads);
-	} while (atomic_read(&n_kfree_perf_thread_ended) < kfree_nrealthreads);
+	wait_event(shutdown_wq,
+		   atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads);
 
 	smp_mb(); /* Wake before output. */
 

From 653ed64b01dc5989f8f579d0038e987476c2c023 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 25 May 2020 00:36:48 -0400
Subject: [PATCH 133/502] refperf: Add a test to measure performance of
 read-side synchronization

Add a test for comparing the performance of RCU with various read-side
synchronization mechanisms. The test has proved useful for collecting
data and performing these comparisons.

Currently RCU, SRCU, reader-writer lock, reader-writer semaphore and
reference counting can be measured using refperf.perf_type parameter.
Each invocation of the test runs measures performance of a specific
mechanism.

The maximum number of CPUs to concurrently run readers on is chosen by
the test itself and is 75% of the total number of CPUs. So if you had 24
CPUs, the test runs with a maximum of 18 parallel readers.

A number of experiments are conducted, and in each experiment, the
number of readers is increased by 1, up to the 75% of CPUs mark. During
each experiment, all readers execute an empty loop with refperf.loops
iterations and time the total loop duration. This is then averaged.

Example output:
Parameters "refperf.perf_type=srcu refperf.loops=2000000" looks like:

[    3.347133] srcu-ref-perf:
[    3.347133] Threads  Time(ns)
[    3.347133] 1        36
[    3.347133] 2        34
[    3.347133] 3        34
[    3.347133] 4        34
[    3.347133] 5        33
[    3.347133] 6        33
[    3.347133] 7        33
[    3.347133] 8        33
[    3.347133] 9        33
[    3.347133] 10       33
[    3.347133] 11       33
[    3.347133] 12       33
[    3.347133] 13       33
[    3.347133] 14       33
[    3.347133] 15       32
[    3.347133] 16       33
[    3.347133] 17       33
[    3.347133] 18       34

Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig.debug |  19 ++
 kernel/rcu/Makefile      |   1 +
 kernel/rcu/refperf.c     | 558 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 578 insertions(+)
 create mode 100644 kernel/rcu/refperf.c

diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 452feae8de20..858765b7f644 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -61,6 +61,25 @@ config RCU_TORTURE_TEST
 	  Say M if you want the RCU torture tests to build as a module.
 	  Say N if you are unsure.
 
+config RCU_REF_PERF_TEST
+	tristate "Performance tests for read-side synchronization (RCU and others)"
+	depends on DEBUG_KERNEL
+	select TORTURE_TEST
+	select SRCU
+	select TASKS_RCU
+	select TASKS_RUDE_RCU
+	select TASKS_TRACE_RCU
+	default n
+	help
+	  This option provides a kernel module that runs performance tests
+	  useful for comparing RCU with various read-side synchronization mechanisms.
+	  The kernel module may be built after the fact on the running kernel to be
+	  tested, if desired.
+
+	  Say Y here if you want these performance tests built into the kernel.
+	  Say M if you want to build it as a module instead.
+	  Say N if you are unsure.
+
 config RCU_CPU_STALL_TIMEOUT
 	int "RCU CPU stall timeout in seconds"
 	depends on RCU_STALL_COMMON
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index f91f2c2cf138..ba7d82609cbe 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o
 obj-$(CONFIG_TINY_SRCU) += srcutiny.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
+obj-$(CONFIG_RCU_REF_PERF_TEST) += refperf.o
 obj-$(CONFIG_TREE_RCU) += tree.o
 obj-$(CONFIG_TINY_RCU) += tiny.o
 obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
new file mode 100644
index 000000000000..61161530acc8
--- /dev/null
+++ b/kernel/rcu/refperf.c
@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: GPL-2.0+
+//
+// Performance test comparing RCU vs other mechanisms
+// for acquiring references on objects.
+//
+// Copyright (C) Google, 2020.
+//
+// Author: Joel Fernandes <joel@joelfernandes.org>
+
+#define pr_fmt(fmt) fmt
+
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/notifier.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/stat.h>
+#include <linux/srcu.h>
+#include <linux/slab.h>
+#include <linux/torture.h>
+#include <linux/types.h>
+
+#include "rcu.h"
+
+#define PERF_FLAG "-ref-perf: "
+
+#define PERFOUT(s, x...) \
+	pr_alert("%s" PERF_FLAG s, perf_type, ## x)
+
+#define VERBOSE_PERFOUT(s, x...) \
+	do { if (verbose) pr_alert("%s" PERF_FLAG s, perf_type, ## x); } while (0)
+
+#define VERBOSE_PERFOUT_ERRSTRING(s, x...) \
+	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! " s, perf_type, ## x); } while (0)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
+
+static char *perf_type = "rcu";	// Matched against ref_perf_ops.name at init.
+module_param(perf_type, charp, 0444);
+MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock).");
+
+torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
+
+// Number of loops per experiment, all readers execute an operation concurrently
+torture_param(long, loops, 10000000, "Number of loops per experiment.");
+
+#ifdef MODULE
+# define REFPERF_SHUTDOWN 0
+#else
+# define REFPERF_SHUTDOWN 1
+#endif
+
+torture_param(bool, shutdown, REFPERF_SHUTDOWN,
+	      "Shutdown at end of performance tests.");
+
+struct reader_task {
+	struct task_struct *task;
+	atomic_t start;
+	wait_queue_head_t wq;
+	u64 last_duration_ns;
+
+	// The average latency When 1..<this reader> are concurrently
+	// running an experiment. For example, if this reader_task is
+	// of index 5 in the reader_tasks array, then result is for
+	// 6 cores.
+	u64 result_avg;
+};
+
+static struct task_struct *shutdown_task;
+static wait_queue_head_t shutdown_wq;
+
+static struct task_struct *main_task;
+static wait_queue_head_t main_wq;
+static int shutdown_start;
+
+static struct reader_task *reader_tasks;
+static int nreaders;
+
+// Number of readers that are part of the current experiment.
+static atomic_t nreaders_exp;
+
+// Use to wait for all threads to start.
+static atomic_t n_init;
+
+// Track which experiment is currently running.
+static int exp_idx;
+
+// Operations vector for selecting different types of tests.
+struct ref_perf_ops {
+	void (*init)(void);
+	void (*cleanup)(void);
+	int (*readlock)(void);
+	void (*readunlock)(int idx);
+	const char *name;
+};
+
+static struct ref_perf_ops *cur_ops;
+
+// Definitions for RCU ref perf testing.
+static int ref_rcu_read_lock(void) __acquires(RCU)
+{
+	rcu_read_lock();
+	return 0;
+}
+
+static void ref_rcu_read_unlock(int idx) __releases(RCU)
+{
+	rcu_read_unlock();
+}
+
+static void rcu_sync_perf_init(void)
+{
+}
+
+static struct ref_perf_ops rcu_ops = {
+	.init		= rcu_sync_perf_init,
+	.readlock	= ref_rcu_read_lock,
+	.readunlock	= ref_rcu_read_unlock,
+	.name		= "rcu"
+};
+
+
+// Definitions for SRCU ref perf testing.
+DEFINE_STATIC_SRCU(srcu_refctl_perf);
+static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf;
+
+static int srcu_ref_perf_read_lock(void) __acquires(srcu_ctlp)
+{
+	return srcu_read_lock(srcu_ctlp);
+}
+
+static void srcu_ref_perf_read_unlock(int idx) __releases(srcu_ctlp)
+{
+	srcu_read_unlock(srcu_ctlp, idx);
+}
+
+static struct ref_perf_ops srcu_ops = {
+	.init		= rcu_sync_perf_init,
+	.readlock	= srcu_ref_perf_read_lock,
+	.readunlock	= srcu_ref_perf_read_unlock,
+	.name		= "srcu"
+};
+
+// Definitions for reference count
+static atomic_t refcnt;
+
+static int srcu_ref_perf_refcnt_lock(void)
+{
+	atomic_inc(&refcnt);
+	return 0;
+}
+
+static void srcu_ref_perf_refcnt_unlock(int idx)
+{
+	// The lock side only bumps refcnt (no SRCU reader), so only undo that.
+	atomic_dec(&refcnt);
+}
+
+static struct ref_perf_ops refcnt_ops = {
+	.init		= rcu_sync_perf_init,
+	.readlock	= srcu_ref_perf_refcnt_lock,
+	.readunlock	= srcu_ref_perf_refcnt_unlock,
+	.name		= "refcnt"
+};
+
+// Definitions for rwlock
+static rwlock_t test_rwlock;
+
+static void ref_perf_rwlock_init(void)
+{
+	rwlock_init(&test_rwlock);
+}
+
+static int ref_perf_rwlock_lock(void)
+{
+	read_lock(&test_rwlock);
+	return 0;
+}
+
+static void ref_perf_rwlock_unlock(int idx)
+{
+	read_unlock(&test_rwlock);
+}
+
+static struct ref_perf_ops rwlock_ops = {
+	.init		= ref_perf_rwlock_init,
+	.readlock	= ref_perf_rwlock_lock,
+	.readunlock	= ref_perf_rwlock_unlock,
+	.name		= "rwlock"
+};
+
+// Definitions for rwsem
+static struct rw_semaphore test_rwsem;
+
+static void ref_perf_rwsem_init(void)
+{
+	init_rwsem(&test_rwsem);
+}
+
+static int ref_perf_rwsem_lock(void)
+{
+	down_read(&test_rwsem);
+	return 0;
+}
+
+static void ref_perf_rwsem_unlock(int idx)
+{
+	up_read(&test_rwsem);
+}
+
+static struct ref_perf_ops rwsem_ops = {
+	.init		= ref_perf_rwsem_init,
+	.readlock	= ref_perf_rwsem_lock,
+	.readunlock	= ref_perf_rwsem_unlock,
+	.name		= "rwsem"
+};
+
+// Reader kthread.  Repeatedly does empty RCU read-side
+// critical section, minimizing update-side interference.
+static int
+ref_perf_reader(void *arg)
+{
+	unsigned long flags;
+	long me = (long)arg;
+	struct reader_task *rt = &(reader_tasks[me]);
+	unsigned long spincnt;
+	int idx;
+	u64 start;
+	s64 duration;
+
+	VERBOSE_PERFOUT("ref_perf_reader %ld: task started", me);
+	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+	set_user_nice(current, MAX_NICE);
+	atomic_inc(&n_init);
+repeat:
+	VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+
+	// Wait for signal that this reader can start.
+	wait_event(rt->wq, (atomic_read(&nreaders_exp) && atomic_read(&rt->start)) ||
+			   torture_must_stop());
+
+	if (torture_must_stop())
+		goto end;
+
+	// Make sure that the CPU is affinitized appropriately during testing.
+	WARN_ON_ONCE(smp_processor_id() != me);
+
+	atomic_dec(&rt->start);
+
+	// To prevent noise, keep interrupts disabled. This also has the
+	// effect of preventing entries into slow path for rcu_read_unlock().
+	local_irq_save(flags);
+	start = ktime_get_mono_fast_ns();
+
+	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
+
+	for (spincnt = 0; spincnt < loops; spincnt++) {
+		idx = cur_ops->readlock();
+		cur_ops->readunlock(idx);
+	}
+
+	duration = ktime_get_mono_fast_ns() - start;
+	local_irq_restore(flags);
+
+	rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
+
+	atomic_dec(&nreaders_exp);
+
+	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)",
+			me, exp_idx, atomic_read(&nreaders_exp));
+
+	if (!atomic_read(&nreaders_exp))
+		wake_up(&main_wq);
+
+	if (!torture_must_stop())
+		goto repeat;
+end:
+	torture_kthread_stopping("ref_perf_reader");
+	return 0;
+}
+
+void reset_readers(int n)
+{
+	int i;
+	struct reader_task *rt;
+
+	for (i = 0; i < n; i++) {
+		rt = &(reader_tasks[i]);
+
+		rt->last_duration_ns = 0;
+	}
+}
+
+// Print the durations of readers 0..n and return the sum of those durations.
+u64 process_durations(int n)
+{
+	int i;
+	struct reader_task *rt;
+	char buf[512];
+	int len;	// Bytes of buf in use, excluding the trailing NUL.
+	u64 sum = 0;
+
+	// scnprintf() bounds every append, so a large reader count truncates
+	// the report instead of running off the end of the on-stack buffer.
+	len = scnprintf(buf, sizeof(buf),
+			"Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
+			exp_idx);
+
+	for (i = 0; i <= n && !torture_must_stop(); i++) {
+		rt = &(reader_tasks[i]);
+		if (i % 5 == 0)	// Five readers per output line.
+			len += scnprintf(buf + len, sizeof(buf) - len, "\n");
+		len += scnprintf(buf + len, sizeof(buf) - len, "%d: %llu\t",
+				 i, rt->last_duration_ns);
+		sum += rt->last_duration_ns;
+	}
+	scnprintf(buf + len, sizeof(buf) - len, "\n");
+
+	PERFOUT("%s\n", buf);
+
+	return sum;
+}
+
+// The main_func is the main orchestrator, it performs a bunch of
+// experiments.  For every experiment, it orders all the readers
+// involved to start and waits for them to finish the experiment. It
+// then reads their timestamps and starts the next experiment. Each
+// experiment progresses from 1 concurrent reader to N of them at which
+// point all the timestamps are printed.
+static int main_func(void *arg)
+{
+	int exp, r;
+	char buf[512];
+	int len;	// Bytes of buf in use, excluding the trailing NUL.
+
+	set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
+	set_user_nice(current, MAX_NICE);
+
+	VERBOSE_PERFOUT("main_func task started");
+	atomic_inc(&n_init);
+
+	// Wait for all threads to start.
+	wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1));
+
+	// Start exp readers up per experiment
+	for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) {
+		if (torture_must_stop())
+			goto end;
+
+		reset_readers(exp);
+		atomic_set(&nreaders_exp, exp + 1);
+
+		exp_idx = exp;
+
+		for (r = 0; r <= exp; r++) {
+			atomic_set(&reader_tasks[r].start, 1);
+			wake_up(&reader_tasks[r].wq);
+		}
+
+		VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers",
+				exp + 1);
+
+		wait_event(main_wq,
+			   !atomic_read(&nreaders_exp) || torture_must_stop());
+
+		VERBOSE_PERFOUT("main_func: experiment ended");
+
+		if (torture_must_stop())
+			goto end;
+
+		reader_tasks[exp].result_avg = process_durations(exp) / ((exp + 1) * loops);
+	}
+
+	// Print the average of all experiments
+	PERFOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
+
+	// Build the table with bounded appends: with many readers the rows
+	// are truncated rather than written past the end of buf.
+	len = scnprintf(buf, sizeof(buf), "\nThreads\tTime(ns)\n");
+
+	for (exp = 0; exp < nreaders; exp++) {
+		len += scnprintf(buf + len, sizeof(buf) - len, "%d\t%llu\n",
+				 exp + 1, reader_tasks[exp].result_avg);
+	}
+
+	PERFOUT("%s", buf);
+
+	// This will shutdown everything including us.
+	if (shutdown) {
+		shutdown_start = 1;
+		wake_up(&shutdown_wq);
+	}
+
+	// Wait for torture to stop us
+	while (!torture_must_stop())
+		schedule_timeout_uninterruptible(1);
+
+end:
+	torture_kthread_stopping("main_func");
+	return 0;
+}
+
+static void
+ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
+{
+	pr_alert("%s" PERF_FLAG
+		 "--- %s:  verbose=%d shutdown=%d loops=%ld\n", perf_type, tag,
+		 verbose, shutdown, loops);
+}
+
+static void
+ref_perf_cleanup(void)
+{
+	int i;
+
+	if (torture_cleanup_begin())
+		return;
+
+	if (!cur_ops) {
+		torture_cleanup_end();
+		return;
+	}
+
+	if (reader_tasks) {
+		for (i = 0; i < nreaders; i++)
+			torture_stop_kthread("ref_perf_reader",
+					     reader_tasks[i].task);
+	}
+	kfree(reader_tasks);
+
+	torture_stop_kthread("main_task", main_task);
+	kfree(main_task);
+
+	// Do perf-type-specific cleanup operations.
+	if (cur_ops->cleanup != NULL)
+		cur_ops->cleanup();
+
+	torture_cleanup_end();
+}
+
+// Shutdown kthread.  Just waits to be awakened, then shuts down system.
+static int
+ref_perf_shutdown(void *arg)
+{
+	wait_event(shutdown_wq, shutdown_start);
+
+	smp_mb(); // Wake before output.
+	ref_perf_cleanup();
+	kernel_power_off();
+
+	return -EINVAL;
+}
+
+// Module init: pick the ops named by perf_type, then spawn the test kthreads.
+static int __init
+ref_perf_init(void)
+{
+	long i;
+	int firsterr = 0;
+	static struct ref_perf_ops *perf_ops[] = {
+		&rcu_ops, &srcu_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops,
+	};
+
+	if (!torture_init_begin(perf_type, verbose))
+		return -EBUSY;
+
+	for (i = 0; i < ARRAY_SIZE(perf_ops); i++) {
+		cur_ops = perf_ops[i];
+		if (strcmp(perf_type, cur_ops->name) == 0)
+			break;
+	}
+	if (i == ARRAY_SIZE(perf_ops)) {
+		pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type);
+		pr_alert("rcu-perf types:");
+		for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
+			pr_cont(" %s", perf_ops[i]->name);
+		pr_cont("\n");
+		WARN_ON(!IS_MODULE(CONFIG_RCU_REF_PERF_TEST));
+		firsterr = -EINVAL;
+		cur_ops = NULL;
+		goto unwind;
+	}
+	if (cur_ops->init)
+		cur_ops->init();
+
+	ref_perf_print_module_parms(cur_ops, "Start of test");
+
+	// Shutdown task
+	if (shutdown) {
+		init_waitqueue_head(&shutdown_wq);
+		firsterr = torture_create_kthread(ref_perf_shutdown, NULL,
+						  shutdown_task);
+		if (firsterr)
+			goto unwind;
+		schedule_timeout_uninterruptible(1);
+	}
+
+	// Reader tasks (~75% of online CPUs).
+	nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
+	reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	if (!reader_tasks) {
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+
+	VERBOSE_PERFOUT("Starting %d reader threads\n", nreaders);
+
+	for (i = 0; i < nreaders; i++) {
+		// The reader waits on its wq right away, so set it up first.
+		init_waitqueue_head(&(reader_tasks[i].wq));
+		firsterr = torture_create_kthread(ref_perf_reader, (void *)i,
+						  reader_tasks[i].task);
+		if (firsterr)
+			goto unwind;
+	}
+
+	// Main Task
+	init_waitqueue_head(&main_wq);
+	firsterr = torture_create_kthread(main_func, NULL, main_task);
+	if (firsterr)
+		goto unwind;
+	schedule_timeout_uninterruptible(1);
+
+	// Wait until all threads start
+	while (atomic_read(&n_init) < nreaders + 1)
+		schedule_timeout_uninterruptible(1);
+
+	wake_up(&main_wq);
+
+	torture_init_end();
+	return 0;
+
+unwind:
+	torture_init_end();
+	ref_perf_cleanup();
+	return firsterr;
+}
+
+module_init(ref_perf_init);
+module_exit(ref_perf_cleanup);

From 708cda31652c02e64adaeafafe7b996e4e14c3eb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 09:22:24 -0700
Subject: [PATCH 134/502] rcuperf: Add comments explaining the high reader
 overhead

This commit adds comments explaining why the readers have otherwise insane
levels of measurement overhead, namely that they are intended as a test
load for update-side performance measurements, not as a straight-up
read-side performance test.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcuperf.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 246da8fe199e..d906ca987936 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -69,6 +69,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
  *	value specified by nr_cpus for a read-only test.
  *
  * Various other use cases may of course be specified.
+ *
+ * Note that this test's readers are intended only as a test load for
+ * the writers.  The reader performance statistics will be overly
+ * pessimistic due to the per-critical-section interrupt disabling,
+ * test-end checks, and the pair of calls through pointers.
  */
 
 #ifdef MODULE
@@ -309,8 +314,10 @@ static void rcu_perf_wait_shutdown(void)
 }
 
 /*
- * RCU perf reader kthread.  Repeatedly does empty RCU read-side
- * critical section, minimizing update-side interference.
+ * RCU perf reader kthread.  Repeatedly does empty RCU read-side critical
+ * section, minimizing update-side interference.  However, the point of
+ * this test is not to evaluate reader performance, but instead to serve
+ * as a test load for update-side performance testing.
  */
 static int
 rcu_perf_reader(void *arg)

From f8b4bb23ec014a5d16663ad70b45d9f46c456ec4 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 14:07:52 -0700
Subject: [PATCH 135/502] torture: Add refperf to the rcutorture scripting

This commit updates the rcutorture scripting to include the new refperf
torture-test module.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../rcutorture/bin/kvm-recheck-refperf.sh     | 67 +++++++++++++++++++
 tools/testing/selftests/rcutorture/bin/kvm.sh |  9 +--
 .../selftests/rcutorture/bin/parse-console.sh |  4 +-
 .../rcutorture/configs/refperf/CFLIST         |  2 +
 .../rcutorture/configs/refperf/CFcommon       |  2 +
 .../rcutorture/configs/refperf/NOPREEMPT      | 18 +++++
 .../rcutorture/configs/refperf/PREEMPT        | 18 +++++
 .../configs/refperf/ver_functions.sh          | 16 +++++
 8 files changed, 130 insertions(+), 6 deletions(-)
 create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
 create mode 100644 tools/testing/selftests/rcutorture/configs/refperf/CFLIST
 create mode 100644 tools/testing/selftests/rcutorture/configs/refperf/CFcommon
 create mode 100644 tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT
 create mode 100644 tools/testing/selftests/rcutorture/configs/refperf/PREEMPT
 create mode 100644 tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
new file mode 100755
index 000000000000..6fc06cd3538e
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Analyze a given results directory for refperf performance measurements.
+#
+# Usage: kvm-recheck-refperf.sh resdir
+#
+# Copyright (C) IBM Corporation, 2016
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+i="$1"
+if test -d "$i" -a -r "$i"
+then
+	:
+else
+	echo Unreadable results directory: $i
+	exit 1
+fi
+PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH
+. functions.sh
+
+configfile=`echo $i | sed -e 's/^.*\///'`
+
+sed -e 's/^\[[^]]*]//' < $i/console.log | tr -d '\015' |
+awk -v configfile="$configfile" '
+/^[ 	]*Threads	Time\(ns\) *$/ {
+	if (dataphase + 0 == 0) {
+		dataphase = 1;
+		# print configfile, $0;
+	}
+	next;
+}
+
+/[^ 	]*[0-9][0-9]*	[0-9][0-9]*\.[0-9][0-9]*$/ {
+	if (dataphase == 1) {
+		# print $0;
+		readertimes[++n] = $2;
+		sum += $2;
+	}
+	next;
+}
+
+{
+	if (dataphase == 1)
+		dataphase = 2;	# Table has ended: stop collecting records.
+	next;
+}
+
+END {
+	print configfile " results:";
+	newNR = asort(readertimes);
+	if (newNR <= 0) {
+		print "No refperf records found???"
+		exit;
+	}
+	medianidx = int(newNR / 2);	# asort() output is indexed 1..newNR.
+	if (newNR == medianidx * 2)
+		medianvalue = (readertimes[medianidx] + readertimes[medianidx + 1]) / 2;
+	else
+		medianvalue = readertimes[medianidx + 1];
+	print "Average reader duration: " sum / newNR " nanoseconds";
+	print "Minimum reader duration: " readertimes[1];
+	print "Median reader duration: " medianvalue;
+	print "Maximum reader duration: " readertimes[newNR];
+	print "Computed from refperf printk output.";
+}'
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index c279cf9cb010..48b6a7248f50 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -180,13 +180,14 @@ do
 		shift
 		;;
 	--torture)
-		checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\)$' '^--'
+		checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refperf\)$' '^--'
 		TORTURE_SUITE=$2
 		shift
-		if test "$TORTURE_SUITE" = rcuperf
+		if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refperf
 		then
-			# If you really want jitter for rcuperf, specify
-			# it after specifying rcuperf.  (But why?)
+			# If you really want jitter for refperf or
+			# rcuperf, specify it after specifying the rcuperf
+			# or the refperf.  (But why jitter in these cases?)
 			jitter=0
 		fi
 		;;
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 4bf62d7b1cbc..85af11d2d0cb 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -33,8 +33,8 @@ then
 fi
 cat /dev/null > $file.diags
 
-# Check for proper termination, except that rcuperf runs don't indicate this.
-if test "$TORTURE_SUITE" != rcuperf
+# Check for proper termination, except for rcuperf and refperf.
+if test "$TORTURE_SUITE" != rcuperf && test "$TORTURE_SUITE" != refperf
 then
 	# check for abject failure
 
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFLIST b/tools/testing/selftests/rcutorture/configs/refperf/CFLIST
new file mode 100644
index 000000000000..4d62eb4a39f9
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refperf/CFLIST
@@ -0,0 +1,2 @@
+NOPREEMPT
+PREEMPT
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon
new file mode 100644
index 000000000000..8ba5ba207503
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon
@@ -0,0 +1,2 @@
+CONFIG_RCU_REF_PERF_TEST=y
+CONFIG_PRINTK_TIME=y
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT
new file mode 100644
index 000000000000..1cd25b7314e3
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT
@@ -0,0 +1,18 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=y
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=n
+#CHECK#CONFIG_PREEMPT_RCU=n
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/PREEMPT b/tools/testing/selftests/rcutorture/configs/refperf/PREEMPT
new file mode 100644
index 000000000000..d10bc694f42c
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refperf/PREEMPT
@@ -0,0 +1,18 @@
+CONFIG_SMP=y
+CONFIG_PREEMPT_NONE=n
+CONFIG_PREEMPT_VOLUNTARY=n
+CONFIG_PREEMPT=y
+#CHECK#CONFIG_PREEMPT_RCU=y
+CONFIG_HZ_PERIODIC=n
+CONFIG_NO_HZ_IDLE=y
+CONFIG_NO_HZ_FULL=n
+CONFIG_RCU_FAST_NO_HZ=n
+CONFIG_HOTPLUG_CPU=n
+CONFIG_SUSPEND=n
+CONFIG_HIBERNATION=n
+CONFIG_RCU_NOCB_CPU=n
+CONFIG_DEBUG_LOCK_ALLOC=n
+CONFIG_PROVE_LOCKING=n
+CONFIG_RCU_BOOST=n
+CONFIG_DEBUG_OBJECTS_RCU_HEAD=n
+CONFIG_RCU_EXPERT=y
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh
new file mode 100644
index 000000000000..489f05dd929a
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Torture-suite-dependent shell functions for the rest of the scripts.
+#
+# Copyright (C) IBM Corporation, 2015
+#
+# Authors: Paul E. McKenney <paulmck@linux.ibm.com>
+
+# per_version_boot_params bootparam-string config-file seconds
+#
+# Adds per-version torture-module parameters to kernels supporting them.
+per_version_boot_params () {
+	echo $1 refperf.shutdown=1 \
+		refperf.verbose=1
+}

From 777a54c908ec69fa0eccab54068a49ecda38ffde Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 14:16:44 -0700
Subject: [PATCH 136/502] refperf: Add holdoff parameter to allow CPUs to come
 online

This commit adds a refperf module parameter named "holdoff" that
defaults to 10 seconds if refperf is built in and to zero otherwise.
The assumption is that all the CPUs are online by the time that the
modprobe and insmod commands are going to do anything, and that normal
systems will have all the CPUs online within ten seconds.

Larger systems may take many tens of seconds or even minutes to get
to this point, hence this being a module parameter instead of being a
hard-coded constant.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 61161530acc8..4d686fdc3105 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -57,7 +57,10 @@ MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
 
 torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
 
-// Number of loops per experiment, all readers execute an operation concurrently
+// Wait until there are multiple CPUs before starting test.
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0,
+	      "Holdoff time before test start (s)");
+// Number of loops per experiment, all readers execute operations concurrently.
 torture_param(long, loops, 10000000, "Number of loops per experiment.");
 
 #ifdef MODULE
@@ -248,6 +251,8 @@ ref_perf_reader(void *arg)
 	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
 	set_user_nice(current, MAX_NICE);
 	atomic_inc(&n_init);
+	if (holdoff)
+		schedule_timeout_interruptible(holdoff * HZ);
 repeat:
 	VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
 
@@ -357,6 +362,8 @@ static int main_func(void *arg)
 
 	// Wait for all threads to start.
 	wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1));
+	if (holdoff)
+		schedule_timeout_interruptible(holdoff * HZ);
 
 	// Start exp readers up per experiment
 	for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) {
@@ -420,8 +427,8 @@ static void
 ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" PERF_FLAG
-		 "--- %s:  verbose=%d shutdown=%d loops=%ld\n", perf_type, tag,
-		 verbose, shutdown, loops);
+		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld\n", perf_type, tag,
+		 verbose, shutdown, holdoff, loops);
 }
 
 static void

From 75dd8efef56ed5959c398974c785026f84aa0d1a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 14:59:06 -0700
Subject: [PATCH 137/502] refperf: Hoist function-pointer calls out of the loop

Current runs show PREEMPT=n rcu_read_lock()/rcu_read_unlock() pairs
consuming between 20 and 30 nanoseconds, when in fact the actual value is
zero, give or take the barrier() asm's effect on compiler optimizations.
The additional overhead is caused by function calls through pointers
(especially in these days of Spectre mitigations) and perhaps also
needless argument passing, a non-const loop limit, and an upcounting loop.

This commit therefore combines the ->readlock() and ->readunlock()
function pointers into a single ->readsection() function pointer that
takes the loop count as a const parameter and keeps any data passed
from the read-lock to the read-unlock internal to this new function.

These changes reduce the measured overhead of the aforementioned
PREEMPT=n rcu_read_lock()/rcu_read_unlock() pairs from between 20 and
30 nanoseconds to somewhere south of 500 picoseconds.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 92 ++++++++++++++++++--------------------------
 1 file changed, 38 insertions(+), 54 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 4d686fdc3105..57c7b7a40bd2 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -108,23 +108,20 @@ static int exp_idx;
 struct ref_perf_ops {
 	void (*init)(void);
 	void (*cleanup)(void);
-	int (*readlock)(void);
-	void (*readunlock)(int idx);
+	void (*readsection)(const int nloops);
 	const char *name;
 };
 
 static struct ref_perf_ops *cur_ops;
 
-// Definitions for RCU ref perf testing.
-static int ref_rcu_read_lock(void) __acquires(RCU)
+static void ref_rcu_read_section(const int nloops)
 {
-	rcu_read_lock();
-	return 0;
-}
+	int i;
 
-static void ref_rcu_read_unlock(int idx) __releases(RCU)
-{
-	rcu_read_unlock();
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock();
+		rcu_read_unlock();
+	}
 }
 
 static void rcu_sync_perf_init(void)
@@ -133,8 +130,7 @@ static void rcu_sync_perf_init(void)
 
 static struct ref_perf_ops rcu_ops = {
 	.init		= rcu_sync_perf_init,
-	.readlock	= ref_rcu_read_lock,
-	.readunlock	= ref_rcu_read_unlock,
+	.readsection	= ref_rcu_read_section,
 	.name		= "rcu"
 };
 
@@ -143,42 +139,39 @@ static struct ref_perf_ops rcu_ops = {
 DEFINE_STATIC_SRCU(srcu_refctl_perf);
 static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf;
 
-static int srcu_ref_perf_read_lock(void) __acquires(srcu_ctlp)
+static void srcu_ref_perf_read_section(int nloops)
 {
-	return srcu_read_lock(srcu_ctlp);
-}
+	int i;
+	int idx;
 
-static void srcu_ref_perf_read_unlock(int idx) __releases(srcu_ctlp)
-{
-	srcu_read_unlock(srcu_ctlp, idx);
+	for (i = nloops; i >= 0; i--) {
+		idx = srcu_read_lock(srcu_ctlp);
+		srcu_read_unlock(srcu_ctlp, idx);
+	}
 }
 
 static struct ref_perf_ops srcu_ops = {
 	.init		= rcu_sync_perf_init,
-	.readlock	= srcu_ref_perf_read_lock,
-	.readunlock	= srcu_ref_perf_read_unlock,
+	.readsection	= srcu_ref_perf_read_section,
 	.name		= "srcu"
 };
 
 // Definitions for reference count
 static atomic_t refcnt;
 
-static int srcu_ref_perf_refcnt_lock(void)
+static void ref_perf_refcnt_section(const int nloops)
 {
-	atomic_inc(&refcnt);
-	return 0;
-}
+	int i;
 
-static void srcu_ref_perf_refcnt_unlock(int idx) __releases(srcu_ctlp)
-{
-	atomic_dec(&refcnt);
-	srcu_read_unlock(srcu_ctlp, idx);
+	for (i = nloops; i >= 0; i--) {
+		atomic_inc(&refcnt);
+		atomic_dec(&refcnt);
+	}
 }
 
 static struct ref_perf_ops refcnt_ops = {
 	.init		= rcu_sync_perf_init,
-	.readlock	= srcu_ref_perf_refcnt_lock,
-	.readunlock	= srcu_ref_perf_refcnt_unlock,
+	.readsection	= ref_perf_refcnt_section,
 	.name		= "refcnt"
 };
 
@@ -190,21 +183,19 @@ static void ref_perf_rwlock_init(void)
 	rwlock_init(&test_rwlock);
 }
 
-static int ref_perf_rwlock_lock(void)
+static void ref_perf_rwlock_section(const int nloops)
 {
-	read_lock(&test_rwlock);
-	return 0;
-}
+	int i;
 
-static void ref_perf_rwlock_unlock(int idx)
-{
-	read_unlock(&test_rwlock);
+	for (i = nloops; i >= 0; i--) {
+		read_lock(&test_rwlock);
+		read_unlock(&test_rwlock);
+	}
 }
 
 static struct ref_perf_ops rwlock_ops = {
 	.init		= ref_perf_rwlock_init,
-	.readlock	= ref_perf_rwlock_lock,
-	.readunlock	= ref_perf_rwlock_unlock,
+	.readsection	= ref_perf_rwlock_section,
 	.name		= "rwlock"
 };
 
@@ -216,21 +207,19 @@ static void ref_perf_rwsem_init(void)
 	init_rwsem(&test_rwsem);
 }
 
-static int ref_perf_rwsem_lock(void)
+static void ref_perf_rwsem_section(const int nloops)
 {
-	down_read(&test_rwsem);
-	return 0;
-}
+	int i;
 
-static void ref_perf_rwsem_unlock(int idx)
-{
-	up_read(&test_rwsem);
+	for (i = nloops; i >= 0; i--) {
+		down_read(&test_rwsem);
+		up_read(&test_rwsem);
+	}
 }
 
 static struct ref_perf_ops rwsem_ops = {
 	.init		= ref_perf_rwsem_init,
-	.readlock	= ref_perf_rwsem_lock,
-	.readunlock	= ref_perf_rwsem_unlock,
+	.readsection	= ref_perf_rwsem_section,
 	.name		= "rwsem"
 };
 
@@ -242,8 +231,6 @@ ref_perf_reader(void *arg)
 	unsigned long flags;
 	long me = (long)arg;
 	struct reader_task *rt = &(reader_tasks[me]);
-	unsigned long spincnt;
-	int idx;
 	u64 start;
 	s64 duration;
 
@@ -275,10 +262,7 @@ repeat:
 
 	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
 
-	for (spincnt = 0; spincnt < loops; spincnt++) {
-		idx = cur_ops->readlock();
-		cur_ops->readunlock(idx);
-	}
+	cur_ops->readsection(loops);
 
 	duration = ktime_get_mono_fast_ns() - start;
 	local_irq_restore(flags);

From 83b88c86da0e5f97faeac5a9bb19fe32f8c0394b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 15:31:07 -0700
Subject: [PATCH 138/502] refperf: Allow decimal nanoseconds

The CONFIG_PREEMPT=n rcu_read_lock()/rcu_read_unlock() pair's overhead,
even including loop overhead, is far less than one nanosecond.
Since logscale plots are not all that happy with zero values, provide
picoseconds as decimals.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 57c7b7a40bd2..e991d4820f51 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -375,7 +375,7 @@ static int main_func(void *arg)
 		if (torture_must_stop())
 			goto end;
 
-		reader_tasks[exp].result_avg = process_durations(exp) / ((exp + 1) * loops);
+		reader_tasks[exp].result_avg = 1000 * process_durations(exp) / ((exp + 1) * loops);
 	}
 
 	// Print the average of all experiments
@@ -386,7 +386,7 @@ static int main_func(void *arg)
 	strcat(buf, "Threads\tTime(ns)\n");
 
 	for (exp = 0; exp < nreaders; exp++) {
-		sprintf(buf1, "%d\t%llu\n", exp + 1, reader_tasks[exp].result_avg);
+		sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, reader_tasks[exp].result_avg / 1000, (int)(reader_tasks[exp].result_avg % 1000));
 		strcat(buf, buf1);
 	}
 

From 8fc28783a0c3704ea27505a25dbde8333d75380c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 15:48:38 -0700
Subject: [PATCH 139/502] refperf: Convert nreaders to a module parameter

This commit converts nreaders to a module parameter, with the default
of -1 specifying the old behavior of using 75% of the CPUs.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index e991d4820f51..020e55a9a64b 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -62,6 +62,12 @@ torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0,
 	      "Holdoff time before test start (s)");
 // Number of loops per experiment, all readers execute operations concurrently.
 torture_param(long, loops, 10000000, "Number of loops per experiment.");
+// Number of readers, with -1 defaulting to about 75% of the CPUs.
+torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
+// Number of runs.
+torture_param(int, nruns, 30, "Number of experiments to run.");
+// Reader delay in nanoseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
 
 #ifdef MODULE
 # define REFPERF_SHUTDOWN 0
@@ -93,7 +99,6 @@ static wait_queue_head_t main_wq;
 static int shutdown_start;
 
 static struct reader_task *reader_tasks;
-static int nreaders;
 
 // Number of readers that are part of the current experiment.
 static atomic_t nreaders_exp;
@@ -411,8 +416,8 @@ static void
 ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" PERF_FLAG
-		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld\n", perf_type, tag,
-		 verbose, shutdown, holdoff, loops);
+		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d\n", perf_type, tag,
+		 verbose, shutdown, holdoff, loops, nreaders);
 }
 
 static void
@@ -501,8 +506,9 @@ ref_perf_init(void)
 		schedule_timeout_uninterruptible(1);
 	}
 
-	// Reader tasks (~75% of online CPUs).
-	nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
+	// Reader tasks (default to ~75% of online CPUs).
+	if (nreaders < 0)
+		nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2);
 	reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
 			       GFP_KERNEL);
 	if (!reader_tasks) {

From dbf28efdae7bb51032eeb0fe1b6bd07d6f0f9b6c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 17:22:24 -0700
Subject: [PATCH 140/502] refperf: Provide module parameter to specify number
 of experiments

The current code uses the number of threads both to limit the number
of threads and to specify the number of experiments, but also varies
the number of threads as the experiments progress.  This commit takes
a different approach by adding a refperf.nruns module parameter that
specifies the number of experiments, and furthermore uses the same
number of threads for each experiment.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 020e55a9a64b..6324449db404 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -83,12 +83,6 @@ struct reader_task {
 	atomic_t start;
 	wait_queue_head_t wq;
 	u64 last_duration_ns;
-
-	// The average latency When 1..<this reader> are concurrently
-	// running an experiment. For example, if this reader_task is
-	// of index 5 in the reader_tasks array, then result is for
-	// 6 cores.
-	u64 result_avg;
 };
 
 static struct task_struct *shutdown_task;
@@ -289,12 +283,12 @@ end:
 	return 0;
 }
 
-void reset_readers(int n)
+void reset_readers(void)
 {
 	int i;
 	struct reader_task *rt;
 
-	for (i = 0; i < n; i++) {
+	for (i = 0; i < nreaders; i++) {
 		rt = &(reader_tasks[i]);
 
 		rt->last_duration_ns = 0;
@@ -314,7 +308,7 @@ u64 process_durations(int n)
 	sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
 		exp_idx);
 
-	for (i = 0; i <= n && !torture_must_stop(); i++) {
+	for (i = 0; i < n && !torture_must_stop(); i++) {
 		rt = &(reader_tasks[i]);
 		sprintf(buf1, "%d: %llu\t", i, rt->last_duration_ns);
 
@@ -342,11 +336,15 @@ static int main_func(void *arg)
 	int exp, r;
 	char buf1[64];
 	char buf[512];
+	u64 *result_avg;
 
 	set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
 	set_user_nice(current, MAX_NICE);
 
 	VERBOSE_PERFOUT("main_func task started");
+	result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
+	if (!result_avg)
+		VERBOSE_PERFOUT_ERRSTRING("out of memory");
 	atomic_inc(&n_init);
 
 	// Wait for all threads to start.
@@ -355,22 +353,24 @@ static int main_func(void *arg)
 		schedule_timeout_interruptible(holdoff * HZ);
 
 	// Start exp readers up per experiment
-	for (exp = 0; exp < nreaders && !torture_must_stop(); exp++) {
+	for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
+		if (!result_avg)
+			break;
 		if (torture_must_stop())
 			goto end;
 
-		reset_readers(exp);
-		atomic_set(&nreaders_exp, exp + 1);
+		reset_readers();
+		atomic_set(&nreaders_exp, nreaders);
 
 		exp_idx = exp;
 
-		for (r = 0; r <= exp; r++) {
+		for (r = 0; r < nreaders; r++) {
 			atomic_set(&reader_tasks[r].start, 1);
 			wake_up(&reader_tasks[r].wq);
 		}
 
 		VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers",
-				exp);
+				nreaders);
 
 		wait_event(main_wq,
 			   !atomic_read(&nreaders_exp) || torture_must_stop());
@@ -380,7 +380,7 @@ static int main_func(void *arg)
 		if (torture_must_stop())
 			goto end;
 
-		reader_tasks[exp].result_avg = 1000 * process_durations(exp) / ((exp + 1) * loops);
+		result_avg[exp] = 1000 * process_durations(nreaders) / (nreaders * loops);
 	}
 
 	// Print the average of all experiments
@@ -390,12 +390,15 @@ static int main_func(void *arg)
 	strcat(buf, "\n");
 	strcat(buf, "Threads\tTime(ns)\n");
 
-	for (exp = 0; exp < nreaders; exp++) {
-		sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, reader_tasks[exp].result_avg / 1000, (int)(reader_tasks[exp].result_avg % 1000));
+	for (exp = 0; exp < nruns; exp++) {
+		if (!result_avg)
+			break;
+		sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000));
 		strcat(buf, buf1);
 	}
 
-	PERFOUT("%s", buf);
+	if (result_avg)
+		PERFOUT("%s", buf);
 
 	// This will shutdown everything including us.
 	if (shutdown) {
@@ -416,8 +419,8 @@ static void
 ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" PERF_FLAG
-		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d\n", perf_type, tag,
-		 verbose, shutdown, holdoff, loops, nreaders);
+		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d\n", perf_type, tag,
+		 verbose, shutdown, holdoff, loops, nreaders, nruns);
 }
 
 static void

From f518f154ecef347777db33b7c9b0581f245159f0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 17:32:56 -0700
Subject: [PATCH 141/502] refperf: Dynamically allocate experiment-summary
 output buffer

Currently, the buffer used to accumulate the experiment-summary output
is fixed size, which will cause problems if someone decides to run
one hundred experiments.  This commit therefore dynamically allocates
this buffer.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 6324449db404..75b9cceaece1 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -333,9 +333,10 @@ u64 process_durations(int n)
 // point all the timestamps are printed.
 static int main_func(void *arg)
 {
+	bool errexit = false;
 	int exp, r;
 	char buf1[64];
-	char buf[512];
+	char *buf;
 	u64 *result_avg;
 
 	set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
@@ -343,8 +344,11 @@ static int main_func(void *arg)
 
 	VERBOSE_PERFOUT("main_func task started");
 	result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
-	if (!result_avg)
+	buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
+	if (!result_avg || !buf) {
 		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		errexit = true;
+	}
 	atomic_inc(&n_init);
 
 	// Wait for all threads to start.
@@ -354,7 +358,7 @@ static int main_func(void *arg)
 
 	// Start exp readers up per experiment
 	for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
-		if (!result_avg)
+		if (errexit)
 			break;
 		if (torture_must_stop())
 			goto end;
@@ -391,13 +395,13 @@ static int main_func(void *arg)
 	strcat(buf, "Threads\tTime(ns)\n");
 
 	for (exp = 0; exp < nruns; exp++) {
-		if (!result_avg)
+		if (errexit)
 			break;
 		sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000));
 		strcat(buf, buf1);
 	}
 
-	if (result_avg)
+	if (!errexit)
 		PERFOUT("%s", buf);
 
 	// This will shutdown everything including us.
@@ -412,6 +416,8 @@ static int main_func(void *arg)
 
 end:
 	torture_kthread_stopping("main_func");
+	kfree(result_avg);
+	kfree(buf);
 	return 0;
 }
 

From 2e90de76f226f11fe26c871aa321be28152f565a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 25 May 2020 17:45:03 -0700
Subject: [PATCH 142/502] refperf: Dynamically allocate thread-summary output
 buffer

Currently, the buffer used to accumulate the thread-summary output is
fixed size, which will cause problems if someone decides to run on a large
number of CPUs.  This commit therefore dynamically allocates this buffer.

[ paulmck: Fix memory allocation as suggested by KASAN. ]
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 75b9cceaece1..fc940e3dba1f 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -301,9 +301,12 @@ u64 process_durations(int n)
 	int i;
 	struct reader_task *rt;
 	char buf1[64];
-	char buf[512];
+	char *buf;
 	u64 sum = 0;
 
+	buf = kmalloc(128 + nreaders * 32, GFP_KERNEL);
+	if (!buf)
+		return 0;
 	buf[0] = 0;
 	sprintf(buf, "Experiment #%d (Format: <THREAD-NUM>:<Total loop time in ns>)",
 		exp_idx);
@@ -322,6 +325,7 @@ u64 process_durations(int n)
 
 	PERFOUT("%s\n", buf);
 
+	kfree(buf);
 	return sum;
 }
 

From 2990750bceb05c3cdeae3a6d2683cbc4ae4de15e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 09:32:57 -0700
Subject: [PATCH 143/502] refperf: Make functions static

Because the reset_readers() and process_durations() functions are used
only within kernel/rcu/refperf.c, this commit makes them static.

Reported-by: kbuild test robot <lkp@intel.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index fc940e3dba1f..0a900f3ae151 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -283,7 +283,7 @@ end:
 	return 0;
 }
 
-void reset_readers(void)
+static void reset_readers(void)
 {
 	int i;
 	struct reader_task *rt;
@@ -296,7 +296,7 @@ void reset_readers(void)
 }
 
 // Print the results of each reader and return the sum of all their durations.
-u64 process_durations(int n)
+static u64 process_durations(int n)
 {
 	int i;
 	struct reader_task *rt;

From b864f89ff61492f56b4e8c6713a5efec6540a0e2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 10:57:34 -0700
Subject: [PATCH 144/502] refperf: Tune reader measurement interval

This commit moves a printk() out of the measurement interval, converts
an atomic_dec()/atomic_read() pair to atomic_dec_and_test(), and adds
a smp_mb__before_atomic() to avoid potential wake/wait hangs.  These
changes have the added benefit of reducing the number of loops required
for amortizing loop overhead for CONFIG_PREEMPT=n RCU measurements from
1,000,000 to 10,000.  This reduction in turn shortens the test, reducing
the probability of interference.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 0a900f3ae151..8815ccfb6f98 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -252,15 +252,16 @@ repeat:
 	// Make sure that the CPU is affinitized appropriately during testing.
 	WARN_ON_ONCE(smp_processor_id() != me);
 
+	smp_mb__before_atomic();
 	atomic_dec(&rt->start);
 
+	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
+
 	// To prevent noise, keep interrupts disabled. This also has the
 	// effect of preventing entries into slow path for rcu_read_unlock().
 	local_irq_save(flags);
 	start = ktime_get_mono_fast_ns();
 
-	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
-
 	cur_ops->readsection(loops);
 
 	duration = ktime_get_mono_fast_ns() - start;
@@ -268,14 +269,12 @@ repeat:
 
 	rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
 
-	atomic_dec(&nreaders_exp);
+	if (atomic_dec_and_test(&nreaders_exp))
+		wake_up(&main_wq);
 
 	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)",
 			me, exp_idx, atomic_read(&nreaders_exp));
 
-	if (!atomic_read(&nreaders_exp))
-		wake_up(&main_wq);
-
 	if (!torture_must_stop())
 		goto repeat;
 end:

From af2789db13b8dc38d16e969f8c11b9468be42d46 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 11:22:03 -0700
Subject: [PATCH 145/502] refperf: Convert reader_task structure's "start"
 field to int

This commit converts the reader_task structure's "start" field to int
in order to demote a full barrier to an smp_load_acquire() and also to
simplify the code a bit.  While in the area, and to enlist the compiler's
help in ensuring that nothing was missed, the field's name was changed
to start_reader.

Also while in the area, change the main_func() store to use
smp_store_release() to further fortify against wait/wake races.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 8815ccfb6f98..2fd3ed1a0d0d 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -80,7 +80,7 @@ torture_param(bool, shutdown, REFPERF_SHUTDOWN,
 
 struct reader_task {
 	struct task_struct *task;
-	atomic_t start;
+	int start_reader;
 	wait_queue_head_t wq;
 	u64 last_duration_ns;
 };
@@ -243,7 +243,7 @@ repeat:
 	VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
 
 	// Wait for signal that this reader can start.
-	wait_event(rt->wq, (atomic_read(&nreaders_exp) && atomic_read(&rt->start)) ||
+	wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
 			   torture_must_stop());
 
 	if (torture_must_stop())
@@ -252,8 +252,7 @@ repeat:
 	// Make sure that the CPU is affinitized appropriately during testing.
 	WARN_ON_ONCE(smp_processor_id() != me);
 
-	smp_mb__before_atomic();
-	atomic_dec(&rt->start);
+	WRITE_ONCE(rt->start_reader, 0);
 
 	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
 
@@ -372,7 +371,7 @@ static int main_func(void *arg)
 		exp_idx = exp;
 
 		for (r = 0; r < nreaders; r++) {
-			atomic_set(&reader_tasks[r].start, 1);
+			smp_store_release(&reader_tasks[r].start_reader, 1);
 			wake_up(&reader_tasks[r].wq);
 		}
 

From 86e0da2bb8ed934d3dce5a337895f1118f59c087 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 11:40:52 -0700
Subject: [PATCH 146/502] refperf: More closely synchronize reader start times

Currently, readers are awakened individually.  On most systems, this
results in significant wakeup delay from one reader to the next, which
can result in the first and last reader having sole access to the
synchronization primitive in question.  If that synchronization primitive
involves shared memory, those readers will rack up a huge number of
operations in a very short time, causing large perturbations in the
results.

This commit therefore has the readers busy-wait after being awakened,
and uses a new n_started variable to synchronize their start times.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 2fd3ed1a0d0d..234bb0e84a8b 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -99,6 +99,7 @@ static atomic_t nreaders_exp;
 
 // Use to wait for all threads to start.
 static atomic_t n_init;
+static atomic_t n_started;
 
 // Track which experiment is currently running.
 static int exp_idx;
@@ -253,6 +254,9 @@ repeat:
 	WARN_ON_ONCE(smp_processor_id() != me);
 
 	WRITE_ONCE(rt->start_reader, 0);
+	if (!atomic_dec_return(&n_started))
+		while (atomic_read_acquire(&n_started))
+			cpu_relax();
 
 	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
 
@@ -367,6 +371,7 @@ static int main_func(void *arg)
 
 		reset_readers();
 		atomic_set(&nreaders_exp, nreaders);
+		atomic_set(&n_started, nreaders);
 
 		exp_idx = exp;
 

From 2db0bda38453f472640f4ece1e2a495cbd44f892 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 12:34:57 -0700
Subject: [PATCH 147/502] refperf: Add warmup and cooldown processing phases

This commit causes all the readers to start running unmeasured load
until all readers have done at least one such run (thus having warmed
up), then run the measured load, and then run unmeasured load until all
readers have completed their measured load.  This approach avoids any
thread running measured load while other readers are idle.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 234bb0e84a8b..445190b97b05 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -100,6 +100,8 @@ static atomic_t nreaders_exp;
 // Use to wait for all threads to start.
 static atomic_t n_init;
 static atomic_t n_started;
+static atomic_t n_warmedup;
+static atomic_t n_cooleddown;
 
 // Track which experiment is currently running.
 static int exp_idx;
@@ -260,8 +262,15 @@ repeat:
 
 	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
 
-	// To prevent noise, keep interrupts disabled. This also has the
-	// effect of preventing entries into slow path for rcu_read_unlock().
+
+	// To reduce noise, do an initial cache-warming invocation, check
+	// in, and then keep warming until everyone has checked in.
+	cur_ops->readsection(loops);
+	if (!atomic_dec_return(&n_warmedup))
+		while (atomic_read_acquire(&n_warmedup))
+			cur_ops->readsection(loops);
+	// Also keep interrupts disabled.  This also has the effect
+	// of preventing entries into slow path for rcu_read_unlock().
 	local_irq_save(flags);
 	start = ktime_get_mono_fast_ns();
 
@@ -271,6 +280,11 @@ repeat:
 	local_irq_restore(flags);
 
 	rt->last_duration_ns = WARN_ON_ONCE(duration < 0) ? 0 : duration;
+	// To reduce runtime-skew noise, do maintain-load invocations until
+	// everyone is done.
+	if (!atomic_dec_return(&n_cooleddown))
+		while (atomic_read_acquire(&n_cooleddown))
+			cur_ops->readsection(loops);
 
 	if (atomic_dec_and_test(&nreaders_exp))
 		wake_up(&main_wq);
@@ -372,6 +386,8 @@ static int main_func(void *arg)
 		reset_readers();
 		atomic_set(&nreaders_exp, nreaders);
 		atomic_set(&n_started, nreaders);
+		atomic_set(&n_warmedup, nreaders);
+		atomic_set(&n_cooleddown, nreaders);
 
 		exp_idx = exp;
 

From 6efb06340846c788336f402e3a472a24fabb431e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 14:26:25 -0700
Subject: [PATCH 148/502] refperf: Label experiment-number column "Runs"

The experiment-number column is currently labeled "Threads", which is
misleading at best.  This commit therefore relabels it as "Runs", and
adjusts the scripts accordingly.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c                                          | 2 +-
 tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 445190b97b05..2d2d227d761a 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -415,7 +415,7 @@ static int main_func(void *arg)
 
 	buf[0] = 0;
 	strcat(buf, "\n");
-	strcat(buf, "Threads\tTime(ns)\n");
+	strcat(buf, "Runs\tTime(ns)\n");
 
 	for (exp = 0; exp < nruns; exp++) {
 		if (errexit)
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
index 6fc06cd3538e..0660f3fab215 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
@@ -24,7 +24,7 @@ configfile=`echo $i | sed -e 's/^.*\///'`
 
 sed -e 's/^\[[^]]*]//' < $i/console.log | tr -d '\015' |
 awk -v configfile="$configfile" '
-/^[ 	]*Threads	Time\(ns\) *$/ {
+/^[ 	]*Runs	Time\(ns\) *$/ {
 	if (dataphase + 0 == 0) {
 		dataphase = 1;
 		# print configfile, $0;

From 9d1914d34cebe111a23ab1670633900fd770cec3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 26 May 2020 15:30:09 -0700
Subject: [PATCH 149/502] refperf: Output per-experiment data points

Currently, it is necessary to manually edit the console output to see
anything more than statistics, and sometimes the statistics can indicate
outliers that need more investigation.  This commit therefore dumps out
the per-experiment measurements, sorted in ascending order, just before
dumping out the statistics.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
index 0660f3fab215..0e29cfd9986c 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
@@ -59,6 +59,10 @@ END {
 		medianvalue = (readertimes[medianidx - 1] + readertimes[medianidx]) / 2;
 	else
 		medianvalue = readertimes[medianidx];
+	points = "Points:";
+	for (i = 1; i <= newNR; i++)
+		points = points " " readertimes[i];
+	print points;
 	print "Average reader duration: " sum / newNR " nanoseconds";
 	print "Minimum reader duration: " readertimes[1];
 	print "Median reader duration: " medianvalue;

From 96af8669591d740a1e2695c4d96e544409dbf896 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 27 May 2020 16:46:56 -0700
Subject: [PATCH 150/502] refperf: Simplify initialization-time wakeup protocol

This commit moves the reader-launch wait loop from ref_perf_init()
to main_func(), removing one layer of wakeup and allowing slightly
faster system boot.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 2d2d227d761a..7839237ffc17 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -369,13 +369,14 @@ static int main_func(void *arg)
 		VERBOSE_PERFOUT_ERRSTRING("out of memory");
 		errexit = true;
 	}
-	atomic_inc(&n_init);
-
-	// Wait for all threads to start.
-	wait_event(main_wq, atomic_read(&n_init) == (nreaders + 1));
 	if (holdoff)
 		schedule_timeout_interruptible(holdoff * HZ);
 
+	// Wait for all threads to start.
+	atomic_inc(&n_init);
+	while (atomic_read(&n_init) < nreaders + 1)
+		schedule_timeout_uninterruptible(1);
+
 	// Start exp readers up per experiment
 	for (exp = 0; exp < nruns && !torture_must_stop(); exp++) {
 		if (errexit)
@@ -565,14 +566,6 @@ ref_perf_init(void)
 	firsterr = torture_create_kthread(main_func, NULL, main_task);
 	if (firsterr)
 		goto unwind;
-	schedule_timeout_uninterruptible(1);
-
-
-	// Wait until all threads start
-	while (atomic_read(&n_init) < nreaders + 1)
-		schedule_timeout_uninterruptible(1);
-
-	wake_up(&main_wq);
 
 	torture_init_end();
 	return 0;

From b4d1e34f6502a138e32275baabdb6d593d7ea432 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 28 May 2020 16:37:35 -0700
Subject: [PATCH 151/502] refperf: Add read-side delay module parameter

This commit adds a refperf.readdelay module parameter that controls the
duration of each critical section.  This parameter allows gathering data
showing how the performance differences between the various primitives
vary with critical-section length.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 108 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 89 insertions(+), 19 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 7839237ffc17..57a750bbcaca 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -66,8 +66,8 @@ torture_param(long, loops, 10000000, "Number of loops per experiment.");
 torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
 // Number of runs.
 torture_param(int, nruns, 30, "Number of experiments to run.");
-// Reader delay in nanoseconds, 0 for no delay.
-torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
+// Reader delay in microseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in microseconds.");
 
 #ifdef MODULE
 # define REFPERF_SHUTDOWN 0
@@ -111,6 +111,7 @@ struct ref_perf_ops {
 	void (*init)(void);
 	void (*cleanup)(void);
 	void (*readsection)(const int nloops);
+	void (*delaysection)(const int nloops, const int ndelay);
 	const char *name;
 };
 
@@ -126,6 +127,17 @@ static void ref_rcu_read_section(const int nloops)
 	}
 }
 
+static void ref_rcu_delay_section(const int nloops, const int ndelay)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock();
+		udelay(ndelay);
+		rcu_read_unlock();
+	}
+}
+
 static void rcu_sync_perf_init(void)
 {
 }
@@ -133,6 +145,7 @@ static void rcu_sync_perf_init(void)
 static struct ref_perf_ops rcu_ops = {
 	.init		= rcu_sync_perf_init,
 	.readsection	= ref_rcu_read_section,
+	.delaysection	= ref_rcu_delay_section,
 	.name		= "rcu"
 };
 
@@ -141,7 +154,7 @@ static struct ref_perf_ops rcu_ops = {
 DEFINE_STATIC_SRCU(srcu_refctl_perf);
 static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf;
 
-static void srcu_ref_perf_read_section(int nloops)
+static void srcu_ref_perf_read_section(const int nloops)
 {
 	int i;
 	int idx;
@@ -152,16 +165,29 @@ static void srcu_ref_perf_read_section(int nloops)
 	}
 }
 
+static void srcu_ref_perf_delay_section(const int nloops, const int ndelay)
+{
+	int i;
+	int idx;
+
+	for (i = nloops; i >= 0; i--) {
+		idx = srcu_read_lock(srcu_ctlp);
+		udelay(ndelay);
+		srcu_read_unlock(srcu_ctlp, idx);
+	}
+}
+
 static struct ref_perf_ops srcu_ops = {
 	.init		= rcu_sync_perf_init,
 	.readsection	= srcu_ref_perf_read_section,
+	.delaysection	= srcu_ref_perf_delay_section,
 	.name		= "srcu"
 };
 
 // Definitions for reference count
 static atomic_t refcnt;
 
-static void ref_perf_refcnt_section(const int nloops)
+static void ref_refcnt_section(const int nloops)
 {
 	int i;
 
@@ -171,21 +197,33 @@ static void ref_perf_refcnt_section(const int nloops)
 	}
 }
 
+static void ref_refcnt_delay_section(const int nloops, const int ndelay)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		atomic_inc(&refcnt);
+		udelay(ndelay);
+		atomic_dec(&refcnt);
+	}
+}
+
 static struct ref_perf_ops refcnt_ops = {
 	.init		= rcu_sync_perf_init,
-	.readsection	= ref_perf_refcnt_section,
+	.readsection	= ref_refcnt_section,
+	.delaysection	= ref_refcnt_delay_section,
 	.name		= "refcnt"
 };
 
 // Definitions for rwlock
 static rwlock_t test_rwlock;
 
-static void ref_perf_rwlock_init(void)
+static void ref_rwlock_init(void)
 {
 	rwlock_init(&test_rwlock);
 }
 
-static void ref_perf_rwlock_section(const int nloops)
+static void ref_rwlock_section(const int nloops)
 {
 	int i;
 
@@ -195,21 +233,33 @@ static void ref_perf_rwlock_section(const int nloops)
 	}
 }
 
+static void ref_rwlock_delay_section(const int nloops, const int ndelay)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		read_lock(&test_rwlock);
+		udelay(ndelay);
+		read_unlock(&test_rwlock);
+	}
+}
+
 static struct ref_perf_ops rwlock_ops = {
-	.init		= ref_perf_rwlock_init,
-	.readsection	= ref_perf_rwlock_section,
+	.init		= ref_rwlock_init,
+	.readsection	= ref_rwlock_section,
+	.delaysection	= ref_rwlock_delay_section,
 	.name		= "rwlock"
 };
 
 // Definitions for rwsem
 static struct rw_semaphore test_rwsem;
 
-static void ref_perf_rwsem_init(void)
+static void ref_rwsem_init(void)
 {
 	init_rwsem(&test_rwsem);
 }
 
-static void ref_perf_rwsem_section(const int nloops)
+static void ref_rwsem_section(const int nloops)
 {
 	int i;
 
@@ -219,12 +269,32 @@ static void ref_perf_rwsem_section(const int nloops)
 	}
 }
 
+static void ref_rwsem_delay_section(const int nloops, const int ndelay)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		down_read(&test_rwsem);
+		udelay(ndelay);
+		up_read(&test_rwsem);
+	}
+}
+
 static struct ref_perf_ops rwsem_ops = {
-	.init		= ref_perf_rwsem_init,
-	.readsection	= ref_perf_rwsem_section,
+	.init		= ref_rwsem_init,
+	.readsection	= ref_rwsem_section,
+	.delaysection	= ref_rwsem_delay_section,
 	.name		= "rwsem"
 };
 
+static void rcu_perf_one_reader(void)
+{
+	if (readdelay <= 0)
+		cur_ops->readsection(loops);
+	else
+		cur_ops->delaysection(loops, readdelay);
+}
+
 // Reader kthread.  Repeatedly does empty RCU read-side
 // critical section, minimizing update-side interference.
 static int
@@ -265,16 +335,16 @@ repeat:
 
 	// To reduce noise, do an initial cache-warming invocation, check
 	// in, and then keep warming until everyone has checked in.
-	cur_ops->readsection(loops);
+	rcu_perf_one_reader();
 	if (!atomic_dec_return(&n_warmedup))
 		while (atomic_read_acquire(&n_warmedup))
-			cur_ops->readsection(loops);
+			rcu_perf_one_reader();
 	// Also keep interrupts disabled.  This also has the effect
 	// of preventing entries into slow path for rcu_read_unlock().
 	local_irq_save(flags);
 	start = ktime_get_mono_fast_ns();
 
-	cur_ops->readsection(loops);
+	rcu_perf_one_reader();
 
 	duration = ktime_get_mono_fast_ns() - start;
 	local_irq_restore(flags);
@@ -284,7 +354,7 @@ repeat:
 	// everyone is done.
 	if (!atomic_dec_return(&n_cooleddown))
 		while (atomic_read_acquire(&n_cooleddown))
-			cur_ops->readsection(loops);
+			rcu_perf_one_reader();
 
 	if (atomic_dec_and_test(&nreaders_exp))
 		wake_up(&main_wq);
@@ -449,8 +519,8 @@ static void
 ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
 {
 	pr_alert("%s" PERF_FLAG
-		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d\n", perf_type, tag,
-		 verbose, shutdown, holdoff, loops, nreaders, nruns);
+		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", perf_type, tag,
+		 verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
 }
 
 static void

From 4dd72a338a07486823037a6b45334d05192c913a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 29 May 2020 13:11:26 -0700
Subject: [PATCH 152/502] refperf: Adjust refperf.loops default value

With the various measurement optimizations, 10,000 loops normally
suffices.  This commit therefore reduces the refperf.loops default value
from 10,000,000 to 10,000.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 57a750bbcaca..063eeb0473a1 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -61,7 +61,7 @@ torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
 torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0,
 	      "Holdoff time before test start (s)");
 // Number of loops per experiment, all readers execute operations concurrently.
-torture_param(long, loops, 10000000, "Number of loops per experiment.");
+torture_param(long, loops, 10000, "Number of loops per experiment.");
 // Number of readers, with -1 defaulting to about 75% of the CPUs.
 torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
 // Number of runs.

From 847dd70aa971a67b4dfdb8f131428dfb90d88714 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 29 May 2020 14:24:03 -0700
Subject: [PATCH 153/502] doc: Document refperf's module parameters

This commit adds documentation for the refperf module parameters.

Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..20cd00b78fc4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4407,6 +4407,42 @@
 			      reboot_cpu is s[mp]#### with #### being the processor
 					to be used for rebooting.
 
+	refperf.holdoff= [KNL]
+			Set test-start holdoff period.  The purpose of
+			this parameter is to delay the start of the
+			test until boot completes in order to avoid
+			interference.
+
+	refperf.loops= [KNL]
+			Set the number of loops over the synchronization
+			primitive under test.  Increasing this number
+			reduces noise due to loop start/end overhead,
+			but the default has already reduced the per-pass
+			noise to a handful of picoseconds on ca. 2020
+			x86 laptops.
+
+	refperf.nreaders= [KNL]
+			Set number of readers.  The default value of -1
+			selects N, where N is roughly 75% of the number
+			of CPUs.  A value of zero is an interesting choice.
+
+	refperf.nruns= [KNL]
+			Set number of runs, each of which is dumped onto
+			the console log.
+
+	refperf.readdelay= [KNL]
+			Set the read-side critical-section duration,
+			measured in microseconds.
+
+	refperf.shutdown= [KNL]
+			Shut down the system at the end of the performance
+			test.  This defaults to 1 (shut it down) when
+			refperf is built into the kernel and to 0 (leave
+			it running) when refperf is built as a module.
+
+	refperf.verbose= [KNL]
+			Enable additional printk() statements.
+
 	relax_domain_level=
 			[KNL, SMP] Set scheduler's default relax_domain_level.
 			See Documentation/admin-guide/cgroup-v1/cpusets.rst.

From 7c944d7c67daee84e3c756bb74ad2f32b28c41cf Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Fri, 29 May 2020 14:36:26 -0700
Subject: [PATCH 154/502] refperf: Work around 64-bit division
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A 64-bit division was introduced in refperf, breaking compilation
on all 32-bit architectures:

kernel/rcu/refperf.o: in function `main_func':
refperf.c:(.text+0x57c): undefined reference to `__aeabi_uldivmod'

Fix this by using div_u64 to mark the expensive operation.

[ paulmck: Update primitive and format per Nathan Chancellor. ]
Fixes: bd5b16d6c88d ("refperf: Allow decimal nanoseconds")
Reported-by: kbuild test robot <lkp@intel.com>
Reported-by: Valdis Klētnieks <valdis.kletnieks@vt.edu>
Acked-by: Randy Dunlap <rdunlap@infradead.org> # build-tested
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 063eeb0473a1..80d449060bdf 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -478,7 +478,7 @@ static int main_func(void *arg)
 		if (torture_must_stop())
 			goto end;
 
-		result_avg[exp] = 1000 * process_durations(nreaders) / (nreaders * loops);
+		result_avg[exp] = div_u64(1000 * process_durations(nreaders), nreaders * loops);
 	}
 
 	// Print the average of all experiments
@@ -489,9 +489,13 @@ static int main_func(void *arg)
 	strcat(buf, "Runs\tTime(ns)\n");
 
 	for (exp = 0; exp < nruns; exp++) {
+		u64 avg;
+		u32 rem;
+
 		if (errexit)
 			break;
-		sprintf(buf1, "%d\t%llu.%03d\n", exp + 1, result_avg[exp] / 1000, (int)(result_avg[exp] % 1000));
+		avg = div_u64_rem(result_avg[exp], 1000, &rem);
+		sprintf(buf1, "%d\t%llu.%03u\n", exp + 1, avg, rem);
 		strcat(buf, buf1);
 	}
 

From 918b351d965560c7902ad482cf87049517843ff2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 31 May 2020 18:14:57 -0700
Subject: [PATCH 155/502] refperf: Change readdelay module parameter to
 nanoseconds

The current units of microseconds are too coarse, so this commit
changes the units to nanoseconds.  However, ndelay() is used only for the
sub-microsecond remainder, with udelay() being used for whole microseconds.
For example, setting refperf.readdelay=1500 results in a udelay(1) followed
by an ndelay(500).

Suggested-by: Akira Yokosawa <akiyks@gmail.com>
[ paulmck: Abstracted delay per Akira feedback and move from 80 to 100 lines. ]
[ paulmck: Fix names as suggested by kbuild test robot. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 80d449060bdf..49fffb9bce77 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -66,8 +66,8 @@ torture_param(long, loops, 10000, "Number of loops per experiment.");
 torture_param(int, nreaders, -1, "Number of readers, -1 for 75% of CPUs.");
 // Number of runs.
 torture_param(int, nruns, 30, "Number of experiments to run.");
-// Reader delay in microseconds, 0 for no delay.
-torture_param(int, readdelay, 0, "Read-side delay in microseconds.");
+// Reader delay in nanoseconds, 0 for no delay.
+torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
 
 #ifdef MODULE
 # define REFPERF_SHUTDOWN 0
@@ -111,12 +111,20 @@ struct ref_perf_ops {
 	void (*init)(void);
 	void (*cleanup)(void);
 	void (*readsection)(const int nloops);
-	void (*delaysection)(const int nloops, const int ndelay);
+	void (*delaysection)(const int nloops, const int udl, const int ndl);
 	const char *name;
 };
 
 static struct ref_perf_ops *cur_ops;
 
+static void un_delay(const int udl, const int ndl)
+{
+	if (udl)
+		udelay(udl);
+	if (ndl)
+		ndelay(ndl);
+}
+
 static void ref_rcu_read_section(const int nloops)
 {
 	int i;
@@ -127,13 +135,13 @@ static void ref_rcu_read_section(const int nloops)
 	}
 }
 
-static void ref_rcu_delay_section(const int nloops, const int ndelay)
+static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 
 	for (i = nloops; i >= 0; i--) {
 		rcu_read_lock();
-		udelay(ndelay);
+		un_delay(udl, ndl);
 		rcu_read_unlock();
 	}
 }
@@ -165,14 +173,14 @@ static void srcu_ref_perf_read_section(const int nloops)
 	}
 }
 
-static void srcu_ref_perf_delay_section(const int nloops, const int ndelay)
+static void srcu_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 	int idx;
 
 	for (i = nloops; i >= 0; i--) {
 		idx = srcu_read_lock(srcu_ctlp);
-		udelay(ndelay);
+		un_delay(udl, ndl);
 		srcu_read_unlock(srcu_ctlp, idx);
 	}
 }
@@ -197,13 +205,13 @@ static void ref_refcnt_section(const int nloops)
 	}
 }
 
-static void ref_refcnt_delay_section(const int nloops, const int ndelay)
+static void ref_refcnt_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 
 	for (i = nloops; i >= 0; i--) {
 		atomic_inc(&refcnt);
-		udelay(ndelay);
+		un_delay(udl, ndl);
 		atomic_dec(&refcnt);
 	}
 }
@@ -233,13 +241,13 @@ static void ref_rwlock_section(const int nloops)
 	}
 }
 
-static void ref_rwlock_delay_section(const int nloops, const int ndelay)
+static void ref_rwlock_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 
 	for (i = nloops; i >= 0; i--) {
 		read_lock(&test_rwlock);
-		udelay(ndelay);
+		un_delay(udl, ndl);
 		read_unlock(&test_rwlock);
 	}
 }
@@ -269,13 +277,13 @@ static void ref_rwsem_section(const int nloops)
 	}
 }
 
-static void ref_rwsem_delay_section(const int nloops, const int ndelay)
+static void ref_rwsem_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 
 	for (i = nloops; i >= 0; i--) {
 		down_read(&test_rwsem);
-		udelay(ndelay);
+		un_delay(udl, ndl);
 		up_read(&test_rwsem);
 	}
 }
@@ -292,7 +300,7 @@ static void rcu_perf_one_reader(void)
 	if (readdelay <= 0)
 		cur_ops->readsection(loops);
 	else
-		cur_ops->delaysection(loops, readdelay);
+		cur_ops->delaysection(loops, readdelay / 1000, readdelay % 1000);
 }
 
 // Reader kthread.  Repeatedly does empty RCU read-side

From 72bb749e7048d0a8d7663b59ec1a33bd56c51083 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 2 Jun 2020 08:34:41 -0700
Subject: [PATCH 156/502] refperf: Add test for RCU Tasks Trace readers.

This commit adds testing for RCU Tasks Trace readers to the refperf module.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 49fffb9bce77..da7de9ac548d 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -25,6 +25,7 @@
 #include <linux/notifier.h>
 #include <linux/percpu.h>
 #include <linux/rcupdate.h>
+#include <linux/rcupdate_trace.h>
 #include <linux/reboot.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
@@ -157,7 +158,6 @@ static struct ref_perf_ops rcu_ops = {
 	.name		= "rcu"
 };
 
-
 // Definitions for SRCU ref perf testing.
 DEFINE_STATIC_SRCU(srcu_refctl_perf);
 static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf;
@@ -192,6 +192,35 @@ static struct ref_perf_ops srcu_ops = {
 	.name		= "srcu"
 };
 
+// Definitions for RCU Tasks Trace ref perf testing.
+static void rcu_trace_ref_perf_read_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock_trace();
+		rcu_read_unlock_trace();
+	}
+}
+
+static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--) {
+		rcu_read_lock_trace();
+		un_delay(udl, ndl);
+		rcu_read_unlock_trace();
+	}
+}
+
+static struct ref_perf_ops rcu_trace_ops = {
+	.init		= rcu_sync_perf_init,
+	.readsection	= rcu_trace_ref_perf_read_section,
+	.delaysection	= rcu_trace_ref_perf_delay_section,
+	.name		= "rcu-trace"
+};
+
 // Definitions for reference count
 static atomic_t refcnt;
 
@@ -584,7 +613,7 @@ ref_perf_init(void)
 	long i;
 	int firsterr = 0;
 	static struct ref_perf_ops *perf_ops[] = {
-		&rcu_ops, &srcu_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops,
+		&rcu_ops, &srcu_ops, &rcu_trace_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops,
 	};
 
 	if (!torture_init_begin(perf_type, verbose))

From e13ef442fe522fa1f604efec8c899a0e1fc3d426 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 3 Jun 2020 11:56:34 -0700
Subject: [PATCH 157/502] refperf: Add test for RCU Tasks readers

This commit adds testing for RCU Tasks readers to the refperf module.
This also applies to RCU Rude readers, as both flavors have empty
(as in non-existent) read-side markers.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/refperf.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index da7de9ac548d..2bfdcdcb6bd1 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -192,6 +192,31 @@ static struct ref_perf_ops srcu_ops = {
 	.name		= "srcu"
 };
 
+// Definitions for RCU Tasks ref perf testing: Empty read markers.
+// These definitions also work for RCU Rude readers.
+static void rcu_tasks_ref_perf_read_section(const int nloops)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--)
+		continue;
+}
+
+static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
+{
+	int i;
+
+	for (i = nloops; i >= 0; i--)
+		un_delay(udl, ndl);
+}
+
+static struct ref_perf_ops rcu_tasks_ops = {
+	.init		= rcu_sync_perf_init,
+	.readsection	= rcu_tasks_ref_perf_read_section,
+	.delaysection	= rcu_tasks_ref_perf_delay_section,
+	.name		= "rcu-tasks"
+};
+
 // Definitions for RCU Tasks Trace ref perf testing.
 static void rcu_trace_ref_perf_read_section(const int nloops)
 {
@@ -613,7 +638,8 @@ ref_perf_init(void)
 	long i;
 	int firsterr = 0;
 	static struct ref_perf_ops *perf_ops[] = {
-		&rcu_ops, &srcu_ops, &rcu_trace_ops, &refcnt_ops, &rwlock_ops, &rwsem_ops,
+		&rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
+		&refcnt_ops, &rwlock_ops, &rwsem_ops,
 	};
 
 	if (!torture_init_begin(perf_type, verbose))

From c7dcf8106f7570b133b05ff68fd4100064965d9d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 12 Jun 2020 13:11:29 -0700
Subject: [PATCH 158/502] rcu-tasks: Fix synchronize_rcu_tasks_trace() header
 comment

The synchronize_rcu_tasks_trace() header comment incorrectly claims that
any number of things delimit RCU Tasks Trace read-side critical sections,
when in fact only rcu_read_lock_trace() and rcu_read_unlock_trace() do so.
This commit therefore fixes this comment, and, while in the area, fixes
a typo in the rcu_read_lock_trace() header comment.

Reported-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 include/linux/rcupdate_trace.h | 4 ++--
 kernel/rcu/tasks.h             | 9 ++++-----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/include/linux/rcupdate_trace.h b/include/linux/rcupdate_trace.h
index 4c25a41f8b27..d9015aac78c6 100644
--- a/include/linux/rcupdate_trace.h
+++ b/include/linux/rcupdate_trace.h
@@ -36,8 +36,8 @@ void rcu_read_unlock_trace_special(struct task_struct *t, int nesting);
 /**
  * rcu_read_lock_trace - mark beginning of RCU-trace read-side critical section
  *
- * When synchronize_rcu_trace() is invoked by one task, then that task
- * is guaranteed to block until all other tasks exit their read-side
+ * When synchronize_rcu_tasks_trace() is invoked by one task, then that
+ * task is guaranteed to block until all other tasks exit their read-side
  * critical sections.  Similarly, if call_rcu_trace() is invoked on one
  * task while other tasks are within RCU read-side critical sections,
  * invocation of the corresponding RCU callback is deferred until after
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index ce23f6cc5043..a77298c1d126 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -1118,11 +1118,10 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks_trace);
  * synchronize_rcu_tasks_trace - wait for a trace rcu-tasks grace period
  *
  * Control will return to the caller some time after a trace rcu-tasks
- * grace period has elapsed, in other words after all currently
- * executing rcu-tasks read-side critical sections have elapsed.  These
- * read-side critical sections are delimited by calls to schedule(),
- * cond_resched_tasks_rcu_qs(), userspace execution, and (in theory,
- * anyway) cond_resched().
+ * grace period has elapsed, in other words after all currently executing
+ * rcu-tasks read-side critical sections have elapsed.  These read-side
+ * critical sections are delimited by calls to rcu_read_lock_trace()
+ * and rcu_read_unlock_trace().
  *
  * This is a very specialized primitive, intended only for a few uses in
  * tracing and other situations requiring manipulation of function preambles

From 8e4ec3d02b549a731c94b4bcddff212bb92cdbaf Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 17 Jun 2020 11:33:54 -0700
Subject: [PATCH 159/502] refperf: Rename RCU_REF_PERF_TEST to
 RCU_REF_SCALE_TEST

The old Kconfig option name is all too easy to conflate with the
unrelated "perf" feature, so this commit renames RCU_REF_PERF_TEST to
RCU_REF_SCALE_TEST.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/Kconfig.debug                                    | 4 ++--
 kernel/rcu/Makefile                                         | 2 +-
 kernel/rcu/refperf.c                                        | 6 +++---
 tools/testing/selftests/rcutorture/configs/refperf/CFcommon | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 858765b7f644..3cf6132a4bb9 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -61,8 +61,8 @@ config RCU_TORTURE_TEST
 	  Say M if you want the RCU torture tests to build as a module.
 	  Say N if you are unsure.
 
-config RCU_REF_PERF_TEST
-	tristate "Performance tests for read-side synchronization (RCU and others)"
+config RCU_REF_SCALE_TEST
+	tristate "Scalability tests for read-side synchronization (RCU and others)"
 	depends on DEBUG_KERNEL
 	select TORTURE_TEST
 	select SRCU
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index ba7d82609cbe..45d562de279a 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -12,7 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o
 obj-$(CONFIG_TINY_SRCU) += srcutiny.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
-obj-$(CONFIG_RCU_REF_PERF_TEST) += refperf.o
+obj-$(CONFIG_RCU_REF_SCALE_TEST) += refperf.o
 obj-$(CONFIG_TREE_RCU) += tree.o
 obj-$(CONFIG_TINY_RCU) += tiny.o
 obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refperf.c
index 2bfdcdcb6bd1..7c980573acbe 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refperf.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0+
 //
-// Performance test comparing RCU vs other mechanisms
+// Scalability test comparing RCU vs other mechanisms
 // for acquiring references on objects.
 //
 // Copyright (C) Google, 2020.
@@ -59,7 +59,7 @@ MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
 torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
 
 // Wait until there are multiple CPUs before starting test.
-torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_PERF_TEST) ? 10 : 0,
+torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0,
 	      "Holdoff time before test start (s)");
 // Number of loops per experiment, all readers execute operations concurrently.
 torture_param(long, loops, 10000, "Number of loops per experiment.");
@@ -656,7 +656,7 @@ ref_perf_init(void)
 		for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
 			pr_cont(" %s", perf_ops[i]->name);
 		pr_cont("\n");
-		WARN_ON(!IS_MODULE(CONFIG_RCU_REF_PERF_TEST));
+		WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
 		firsterr = -EINVAL;
 		cur_ops = NULL;
 		goto unwind;
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon
index 8ba5ba207503..a98b58b54bb1 100644
--- a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/refperf/CFcommon
@@ -1,2 +1,2 @@
-CONFIG_RCU_REF_PERF_TEST=y
+CONFIG_RCU_REF_SCALE_TEST=y
 CONFIG_PRINTK_TIME=y

From 1fbeb3a8c4de29433a8d230ee600b13d369b6c0f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 17 Jun 2020 11:53:53 -0700
Subject: [PATCH 160/502] refperf: Rename refperf.c to refscale.c and change
 internal names

This commit further avoids conflation of refperf with the kernel's perf
feature by renaming kernel/rcu/refperf.c to kernel/rcu/refscale.c,
and also by similarly renaming the functions and variables inside
this file.  This has the side effect of changing the names of the
kernel boot parameters, so kernel-parameters.txt and ver_functions.sh
are also updated.

The rcutorture --torture type remains refperf, and this will be
addressed in a separate commit.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         |  17 +-
 kernel/rcu/Makefile                           |   2 +-
 kernel/rcu/{refperf.c => refscale.c}          | 182 +++++++++---------
 .../configs/refperf/ver_functions.sh          |   4 +-
 4 files changed, 104 insertions(+), 101 deletions(-)
 rename kernel/rcu/{refperf.c => refscale.c} (74%)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 20cd00b78fc4..a4e4e0f6a550 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4407,13 +4407,13 @@
 			      reboot_cpu is s[mp]#### with #### being the processor
 					to be used for rebooting.
 
-	refperf.holdoff= [KNL]
+	refscale.holdoff= [KNL]
 			Set test-start holdoff period.  The purpose of
 			this parameter is to delay the start of the
 			test until boot completes in order to avoid
 			interference.
 
-	refperf.loops= [KNL]
+	refscale.loops= [KNL]
 			Set the number of loops over the synchronization
 			primitive under test.  Increasing this number
 			reduces noise due to loop start/end overhead,
@@ -4421,26 +4421,29 @@
 			noise to a handful of picoseconds on ca. 2020
 			x86 laptops.
 
-	refperf.nreaders= [KNL]
+	refscale.nreaders= [KNL]
 			Set number of readers.  The default value of -1
 			selects N, where N is roughly 75% of the number
 			of CPUs.  A value of zero is an interesting choice.
 
-	refperf.nruns= [KNL]
+	refscale.nruns= [KNL]
 			Set number of runs, each of which is dumped onto
 			the console log.
 
-	refperf.readdelay= [KNL]
+	refscale.readdelay= [KNL]
 			Set the read-side critical-section duration,
 			measured in microseconds.
 
-	refperf.shutdown= [KNL]
+	refscale.scale_type= [KNL]
+			Specify the read-protection implementation to test.
+
+	refscale.shutdown= [KNL]
 			Shut down the system at the end of the performance
 			test.  This defaults to 1 (shut it down) when
 			rcuperf is built into the kernel and to 0 (leave
 			it running) when rcuperf is built as a module.
 
-	refperf.verbose= [KNL]
+	refscale.verbose= [KNL]
 			Enable additional printk() statements.
 
 	relax_domain_level=
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 45d562de279a..95f5117ef8da 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -12,7 +12,7 @@ obj-$(CONFIG_TREE_SRCU) += srcutree.o
 obj-$(CONFIG_TINY_SRCU) += srcutiny.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o
-obj-$(CONFIG_RCU_REF_SCALE_TEST) += refperf.o
+obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o
 obj-$(CONFIG_TREE_RCU) += tree.o
 obj-$(CONFIG_TINY_RCU) += tiny.o
 obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o
diff --git a/kernel/rcu/refperf.c b/kernel/rcu/refscale.c
similarity index 74%
rename from kernel/rcu/refperf.c
rename to kernel/rcu/refscale.c
index 7c980573acbe..d9291f883b54 100644
--- a/kernel/rcu/refperf.c
+++ b/kernel/rcu/refscale.c
@@ -38,23 +38,23 @@
 
 #include "rcu.h"
 
-#define PERF_FLAG "-ref-perf: "
+#define SCALE_FLAG "-ref-scale: "
 
-#define PERFOUT(s, x...) \
-	pr_alert("%s" PERF_FLAG s, perf_type, ## x)
+#define SCALEOUT(s, x...) \
+	pr_alert("%s" SCALE_FLAG s, scale_type, ## x)
 
-#define VERBOSE_PERFOUT(s, x...) \
-	do { if (verbose) pr_alert("%s" PERF_FLAG s, perf_type, ## x); } while (0)
+#define VERBOSE_SCALEOUT(s, x...) \
+	do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0)
 
-#define VERBOSE_PERFOUT_ERRSTRING(s, x...) \
-	do { if (verbose) pr_alert("%s" PERF_FLAG "!!! " s, perf_type, ## x); } while (0)
+#define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \
+	do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0)
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
 
-static char *perf_type = "rcu";
-module_param(perf_type, charp, 0444);
-MODULE_PARM_DESC(perf_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
+static char *scale_type = "rcu";
+module_param(scale_type, charp, 0444);
+MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock.");
 
 torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
 
@@ -71,13 +71,13 @@ torture_param(int, nruns, 30, "Number of experiments to run.");
 torture_param(int, readdelay, 0, "Read-side delay in nanoseconds.");
 
 #ifdef MODULE
-# define REFPERF_SHUTDOWN 0
+# define REFSCALE_SHUTDOWN 0
 #else
-# define REFPERF_SHUTDOWN 1
+# define REFSCALE_SHUTDOWN 1
 #endif
 
-torture_param(bool, shutdown, REFPERF_SHUTDOWN,
-	      "Shutdown at end of performance tests.");
+torture_param(bool, shutdown, REFSCALE_SHUTDOWN,
+	      "Shutdown at end of scalability tests.");
 
 struct reader_task {
 	struct task_struct *task;
@@ -108,7 +108,7 @@ static atomic_t n_cooleddown;
 static int exp_idx;
 
 // Operations vector for selecting different types of tests.
-struct ref_perf_ops {
+struct ref_scale_ops {
 	void (*init)(void);
 	void (*cleanup)(void);
 	void (*readsection)(const int nloops);
@@ -116,7 +116,7 @@ struct ref_perf_ops {
 	const char *name;
 };
 
-static struct ref_perf_ops *cur_ops;
+static struct ref_scale_ops *cur_ops;
 
 static void un_delay(const int udl, const int ndl)
 {
@@ -147,22 +147,22 @@ static void ref_rcu_delay_section(const int nloops, const int udl, const int ndl
 	}
 }
 
-static void rcu_sync_perf_init(void)
+static void rcu_sync_scale_init(void)
 {
 }
 
-static struct ref_perf_ops rcu_ops = {
-	.init		= rcu_sync_perf_init,
+static struct ref_scale_ops rcu_ops = {
+	.init		= rcu_sync_scale_init,
 	.readsection	= ref_rcu_read_section,
 	.delaysection	= ref_rcu_delay_section,
 	.name		= "rcu"
 };
 
-// Definitions for SRCU ref perf testing.
-DEFINE_STATIC_SRCU(srcu_refctl_perf);
-static struct srcu_struct *srcu_ctlp = &srcu_refctl_perf;
+// Definitions for SRCU ref scale testing.
+DEFINE_STATIC_SRCU(srcu_refctl_scale);
+static struct srcu_struct *srcu_ctlp = &srcu_refctl_scale;
 
-static void srcu_ref_perf_read_section(const int nloops)
+static void srcu_ref_scale_read_section(const int nloops)
 {
 	int i;
 	int idx;
@@ -173,7 +173,7 @@ static void srcu_ref_perf_read_section(const int nloops)
 	}
 }
 
-static void srcu_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
+static void srcu_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 	int idx;
@@ -185,16 +185,16 @@ static void srcu_ref_perf_delay_section(const int nloops, const int udl, const i
 	}
 }
 
-static struct ref_perf_ops srcu_ops = {
-	.init		= rcu_sync_perf_init,
-	.readsection	= srcu_ref_perf_read_section,
-	.delaysection	= srcu_ref_perf_delay_section,
+static struct ref_scale_ops srcu_ops = {
+	.init		= rcu_sync_scale_init,
+	.readsection	= srcu_ref_scale_read_section,
+	.delaysection	= srcu_ref_scale_delay_section,
 	.name		= "srcu"
 };
 
-// Definitions for RCU Tasks ref perf testing: Empty read markers.
+// Definitions for RCU Tasks ref scale testing: Empty read markers.
 // These definitions also work for RCU Rude readers.
-static void rcu_tasks_ref_perf_read_section(const int nloops)
+static void rcu_tasks_ref_scale_read_section(const int nloops)
 {
 	int i;
 
@@ -202,7 +202,7 @@ static void rcu_tasks_ref_perf_read_section(const int nloops)
 		continue;
 }
 
-static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
+static void rcu_tasks_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 
@@ -210,15 +210,15 @@ static void rcu_tasks_ref_perf_delay_section(const int nloops, const int udl, co
 		un_delay(udl, ndl);
 }
 
-static struct ref_perf_ops rcu_tasks_ops = {
-	.init		= rcu_sync_perf_init,
-	.readsection	= rcu_tasks_ref_perf_read_section,
-	.delaysection	= rcu_tasks_ref_perf_delay_section,
+static struct ref_scale_ops rcu_tasks_ops = {
+	.init		= rcu_sync_scale_init,
+	.readsection	= rcu_tasks_ref_scale_read_section,
+	.delaysection	= rcu_tasks_ref_scale_delay_section,
 	.name		= "rcu-tasks"
 };
 
-// Definitions for RCU Tasks Trace ref perf testing.
-static void rcu_trace_ref_perf_read_section(const int nloops)
+// Definitions for RCU Tasks Trace ref scale testing.
+static void rcu_trace_ref_scale_read_section(const int nloops)
 {
 	int i;
 
@@ -228,7 +228,7 @@ static void rcu_trace_ref_perf_read_section(const int nloops)
 	}
 }
 
-static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, const int ndl)
+static void rcu_trace_ref_scale_delay_section(const int nloops, const int udl, const int ndl)
 {
 	int i;
 
@@ -239,10 +239,10 @@ static void rcu_trace_ref_perf_delay_section(const int nloops, const int udl, co
 	}
 }
 
-static struct ref_perf_ops rcu_trace_ops = {
-	.init		= rcu_sync_perf_init,
-	.readsection	= rcu_trace_ref_perf_read_section,
-	.delaysection	= rcu_trace_ref_perf_delay_section,
+static struct ref_scale_ops rcu_trace_ops = {
+	.init		= rcu_sync_scale_init,
+	.readsection	= rcu_trace_ref_scale_read_section,
+	.delaysection	= rcu_trace_ref_scale_delay_section,
 	.name		= "rcu-trace"
 };
 
@@ -270,8 +270,8 @@ static void ref_refcnt_delay_section(const int nloops, const int udl, const int
 	}
 }
 
-static struct ref_perf_ops refcnt_ops = {
-	.init		= rcu_sync_perf_init,
+static struct ref_scale_ops refcnt_ops = {
+	.init		= rcu_sync_scale_init,
 	.readsection	= ref_refcnt_section,
 	.delaysection	= ref_refcnt_delay_section,
 	.name		= "refcnt"
@@ -306,7 +306,7 @@ static void ref_rwlock_delay_section(const int nloops, const int udl, const int
 	}
 }
 
-static struct ref_perf_ops rwlock_ops = {
+static struct ref_scale_ops rwlock_ops = {
 	.init		= ref_rwlock_init,
 	.readsection	= ref_rwlock_section,
 	.delaysection	= ref_rwlock_delay_section,
@@ -342,14 +342,14 @@ static void ref_rwsem_delay_section(const int nloops, const int udl, const int n
 	}
 }
 
-static struct ref_perf_ops rwsem_ops = {
+static struct ref_scale_ops rwsem_ops = {
 	.init		= ref_rwsem_init,
 	.readsection	= ref_rwsem_section,
 	.delaysection	= ref_rwsem_delay_section,
 	.name		= "rwsem"
 };
 
-static void rcu_perf_one_reader(void)
+static void rcu_scale_one_reader(void)
 {
 	if (readdelay <= 0)
 		cur_ops->readsection(loops);
@@ -360,7 +360,7 @@ static void rcu_perf_one_reader(void)
 // Reader kthread.  Repeatedly does empty RCU read-side
 // critical section, minimizing update-side interference.
 static int
-ref_perf_reader(void *arg)
+ref_scale_reader(void *arg)
 {
 	unsigned long flags;
 	long me = (long)arg;
@@ -368,14 +368,14 @@ ref_perf_reader(void *arg)
 	u64 start;
 	s64 duration;
 
-	VERBOSE_PERFOUT("ref_perf_reader %ld: task started", me);
+	VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me);
 	set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
 	set_user_nice(current, MAX_NICE);
 	atomic_inc(&n_init);
 	if (holdoff)
 		schedule_timeout_interruptible(holdoff * HZ);
 repeat:
-	VERBOSE_PERFOUT("ref_perf_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
+	VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id());
 
 	// Wait for signal that this reader can start.
 	wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) ||
@@ -392,21 +392,21 @@ repeat:
 		while (atomic_read_acquire(&n_started))
 			cpu_relax();
 
-	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d started", me, exp_idx);
+	VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx);
 
 
 	// To reduce noise, do an initial cache-warming invocation, check
 	// in, and then keep warming until everyone has checked in.
-	rcu_perf_one_reader();
+	rcu_scale_one_reader();
 	if (!atomic_dec_return(&n_warmedup))
 		while (atomic_read_acquire(&n_warmedup))
-			rcu_perf_one_reader();
+			rcu_scale_one_reader();
 	// Also keep interrupts disabled.  This also has the effect
 	// of preventing entries into slow path for rcu_read_unlock().
 	local_irq_save(flags);
 	start = ktime_get_mono_fast_ns();
 
-	rcu_perf_one_reader();
+	rcu_scale_one_reader();
 
 	duration = ktime_get_mono_fast_ns() - start;
 	local_irq_restore(flags);
@@ -416,18 +416,18 @@ repeat:
 	// everyone is done.
 	if (!atomic_dec_return(&n_cooleddown))
 		while (atomic_read_acquire(&n_cooleddown))
-			rcu_perf_one_reader();
+			rcu_scale_one_reader();
 
 	if (atomic_dec_and_test(&nreaders_exp))
 		wake_up(&main_wq);
 
-	VERBOSE_PERFOUT("ref_perf_reader %ld: experiment %d ended, (readers remaining=%d)",
+	VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)",
 			me, exp_idx, atomic_read(&nreaders_exp));
 
 	if (!torture_must_stop())
 		goto repeat;
 end:
-	torture_kthread_stopping("ref_perf_reader");
+	torture_kthread_stopping("ref_scale_reader");
 	return 0;
 }
 
@@ -471,7 +471,7 @@ static u64 process_durations(int n)
 	}
 	strcat(buf, "\n");
 
-	PERFOUT("%s\n", buf);
+	SCALEOUT("%s\n", buf);
 
 	kfree(buf);
 	return sum;
@@ -494,11 +494,11 @@ static int main_func(void *arg)
 	set_cpus_allowed_ptr(current, cpumask_of(nreaders % nr_cpu_ids));
 	set_user_nice(current, MAX_NICE);
 
-	VERBOSE_PERFOUT("main_func task started");
+	VERBOSE_SCALEOUT("main_func task started");
 	result_avg = kzalloc(nruns * sizeof(*result_avg), GFP_KERNEL);
 	buf = kzalloc(64 + nruns * 32, GFP_KERNEL);
 	if (!result_avg || !buf) {
-		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		VERBOSE_SCALEOUT_ERRSTRING("out of memory");
 		errexit = true;
 	}
 	if (holdoff)
@@ -529,13 +529,13 @@ static int main_func(void *arg)
 			wake_up(&reader_tasks[r].wq);
 		}
 
-		VERBOSE_PERFOUT("main_func: experiment started, waiting for %d readers",
+		VERBOSE_SCALEOUT("main_func: experiment started, waiting for %d readers",
 				nreaders);
 
 		wait_event(main_wq,
 			   !atomic_read(&nreaders_exp) || torture_must_stop());
 
-		VERBOSE_PERFOUT("main_func: experiment ended");
+		VERBOSE_SCALEOUT("main_func: experiment ended");
 
 		if (torture_must_stop())
 			goto end;
@@ -544,7 +544,7 @@ static int main_func(void *arg)
 	}
 
 	// Print the average of all experiments
-	PERFOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
+	SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n");
 
 	buf[0] = 0;
 	strcat(buf, "\n");
@@ -562,7 +562,7 @@ static int main_func(void *arg)
 	}
 
 	if (!errexit)
-		PERFOUT("%s", buf);
+		SCALEOUT("%s", buf);
 
 	// This will shutdown everything including us.
 	if (shutdown) {
@@ -582,15 +582,15 @@ end:
 }
 
 static void
-ref_perf_print_module_parms(struct ref_perf_ops *cur_ops, const char *tag)
+ref_scale_print_module_parms(struct ref_scale_ops *cur_ops, const char *tag)
 {
-	pr_alert("%s" PERF_FLAG
-		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", perf_type, tag,
+	pr_alert("%s" SCALE_FLAG
+		 "--- %s:  verbose=%d shutdown=%d holdoff=%d loops=%ld nreaders=%d nruns=%d readdelay=%d\n", scale_type, tag,
 		 verbose, shutdown, holdoff, loops, nreaders, nruns, readdelay);
 }
 
 static void
-ref_perf_cleanup(void)
+ref_scale_cleanup(void)
 {
 	int i;
 
@@ -604,7 +604,7 @@ ref_perf_cleanup(void)
 
 	if (reader_tasks) {
 		for (i = 0; i < nreaders; i++)
-			torture_stop_kthread("ref_perf_reader",
+			torture_stop_kthread("ref_scale_reader",
 					     reader_tasks[i].task);
 	}
 	kfree(reader_tasks);
@@ -612,7 +612,7 @@ ref_perf_cleanup(void)
 	torture_stop_kthread("main_task", main_task);
 	kfree(main_task);
 
-	// Do perf-type-specific cleanup operations.
+	// Do scale-type-specific cleanup operations.
 	if (cur_ops->cleanup != NULL)
 		cur_ops->cleanup();
 
@@ -621,40 +621,40 @@ ref_perf_cleanup(void)
 
 // Shutdown kthread.  Just waits to be awakened, then shuts down system.
 static int
-ref_perf_shutdown(void *arg)
+ref_scale_shutdown(void *arg)
 {
 	wait_event(shutdown_wq, shutdown_start);
 
 	smp_mb(); // Wake before output.
-	ref_perf_cleanup();
+	ref_scale_cleanup();
 	kernel_power_off();
 
 	return -EINVAL;
 }
 
 static int __init
-ref_perf_init(void)
+ref_scale_init(void)
 {
 	long i;
 	int firsterr = 0;
-	static struct ref_perf_ops *perf_ops[] = {
+	static struct ref_scale_ops *scale_ops[] = {
 		&rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops,
 		&refcnt_ops, &rwlock_ops, &rwsem_ops,
 	};
 
-	if (!torture_init_begin(perf_type, verbose))
+	if (!torture_init_begin(scale_type, verbose))
 		return -EBUSY;
 
-	for (i = 0; i < ARRAY_SIZE(perf_ops); i++) {
-		cur_ops = perf_ops[i];
-		if (strcmp(perf_type, cur_ops->name) == 0)
+	for (i = 0; i < ARRAY_SIZE(scale_ops); i++) {
+		cur_ops = scale_ops[i];
+		if (strcmp(scale_type, cur_ops->name) == 0)
 			break;
 	}
-	if (i == ARRAY_SIZE(perf_ops)) {
-		pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type);
-		pr_alert("rcu-perf types:");
-		for (i = 0; i < ARRAY_SIZE(perf_ops); i++)
-			pr_cont(" %s", perf_ops[i]->name);
+	if (i == ARRAY_SIZE(scale_ops)) {
+		pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type);
+		pr_alert("rcu-scale types:");
+		for (i = 0; i < ARRAY_SIZE(scale_ops); i++)
+			pr_cont(" %s", scale_ops[i]->name);
 		pr_cont("\n");
 		WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST));
 		firsterr = -EINVAL;
@@ -664,12 +664,12 @@ ref_perf_init(void)
 	if (cur_ops->init)
 		cur_ops->init();
 
-	ref_perf_print_module_parms(cur_ops, "Start of test");
+	ref_scale_print_module_parms(cur_ops, "Start of test");
 
 	// Shutdown task
 	if (shutdown) {
 		init_waitqueue_head(&shutdown_wq);
-		firsterr = torture_create_kthread(ref_perf_shutdown, NULL,
+		firsterr = torture_create_kthread(ref_scale_shutdown, NULL,
 						  shutdown_task);
 		if (firsterr)
 			goto unwind;
@@ -682,15 +682,15 @@ ref_perf_init(void)
 	reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]),
 			       GFP_KERNEL);
 	if (!reader_tasks) {
-		VERBOSE_PERFOUT_ERRSTRING("out of memory");
+		VERBOSE_SCALEOUT_ERRSTRING("out of memory");
 		firsterr = -ENOMEM;
 		goto unwind;
 	}
 
-	VERBOSE_PERFOUT("Starting %d reader threads\n", nreaders);
+	VERBOSE_SCALEOUT("Starting %d reader threads\n", nreaders);
 
 	for (i = 0; i < nreaders; i++) {
-		firsterr = torture_create_kthread(ref_perf_reader, (void *)i,
+		firsterr = torture_create_kthread(ref_scale_reader, (void *)i,
 						  reader_tasks[i].task);
 		if (firsterr)
 			goto unwind;
@@ -709,9 +709,9 @@ ref_perf_init(void)
 
 unwind:
 	torture_init_end();
-	ref_perf_cleanup();
+	ref_scale_cleanup();
 	return firsterr;
 }
 
-module_init(ref_perf_init);
-module_exit(ref_perf_cleanup);
+module_init(ref_scale_init);
+module_exit(ref_scale_cleanup);
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh
index 489f05dd929a..321e82641287 100644
--- a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh
+++ b/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh
@@ -11,6 +11,6 @@
 #
 # Adds per-version torture-module parameters to kernels supporting them.
 per_version_boot_params () {
-	echo $1 refperf.shutdown=1 \
-		refperf.verbose=1
+	echo $1 refscale.shutdown=1 \
+		refscale.verbose=1
 }

From f71d8311ec278525508dac211de700b2b682a15f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 17 Jun 2020 12:06:47 -0700
Subject: [PATCH 161/502] refscale: Change --torture type from refperf to
 refscale

This commit renames the rcutorture configs/refperf directory to
configs/refscale and updates the --torture type accordingly, in order to
further avoid conflation with the Linux kernel's perf feature.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../{kvm-recheck-refperf.sh => kvm-recheck-refscale.sh}   | 8 ++++----
 tools/testing/selftests/rcutorture/bin/kvm.sh             | 8 ++++----
 tools/testing/selftests/rcutorture/bin/parse-console.sh   | 4 ++--
 .../rcutorture/configs/{refperf => refscale}/CFLIST       | 0
 .../rcutorture/configs/{refperf => refscale}/CFcommon     | 0
 .../rcutorture/configs/{refperf => refscale}/NOPREEMPT    | 0
 .../rcutorture/configs/{refperf => refscale}/PREEMPT      | 0
 .../configs/{refperf => refscale}/ver_functions.sh        | 0
 8 files changed, 10 insertions(+), 10 deletions(-)
 rename tools/testing/selftests/rcutorture/bin/{kvm-recheck-refperf.sh => kvm-recheck-refscale.sh} (87%)
 rename tools/testing/selftests/rcutorture/configs/{refperf => refscale}/CFLIST (100%)
 rename tools/testing/selftests/rcutorture/configs/{refperf => refscale}/CFcommon (100%)
 rename tools/testing/selftests/rcutorture/configs/{refperf => refscale}/NOPREEMPT (100%)
 rename tools/testing/selftests/rcutorture/configs/{refperf => refscale}/PREEMPT (100%)
 rename tools/testing/selftests/rcutorture/configs/{refperf => refscale}/ver_functions.sh (100%)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh
similarity index 87%
rename from tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
rename to tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh
index 0e29cfd9986c..35a463dddffe 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-refperf.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-refscale.sh
@@ -1,9 +1,9 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0+
 #
-# Analyze a given results directory for refperf performance measurements.
+# Analyze a given results directory for refscale performance measurements.
 #
-# Usage: kvm-recheck-refperf.sh resdir
+# Usage: kvm-recheck-refscale.sh resdir
 #
 # Copyright (C) IBM Corporation, 2016
 #
@@ -51,7 +51,7 @@ END {
 	print configfile " results:";
 	newNR = asort(readertimes);
 	if (newNR <= 0) {
-		print "No refperf records found???"
+		print "No refscale records found???"
 		exit;
 	}
 	medianidx = int(newNR / 2);
@@ -67,5 +67,5 @@ END {
 	print "Minimum reader duration: " readertimes[1];
 	print "Median reader duration: " medianvalue;
 	print "Maximum reader duration: " readertimes[newNR];
-	print "Computed from refperf printk output.";
+	print "Computed from refscale printk output.";
 }'
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 48b6a7248f50..ce05db324057 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -180,14 +180,14 @@ do
 		shift
 		;;
 	--torture)
-		checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refperf\)$' '^--'
+		checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refscale\)$' '^--'
 		TORTURE_SUITE=$2
 		shift
-		if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refperf
+		if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refscale
 		then
-			# If you really want jitter for refperf or
+			# If you really want jitter for refscale or
 			# rcuperf, specify it after specifying the rcuperf
-			# or the refperf.  (But why jitter in these cases?)
+			# or the refscale.  (But why jitter in these cases?)
 			jitter=0
 		fi
 		;;
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 85af11d2d0cb..8cb908fb852b 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -33,8 +33,8 @@ then
 fi
 cat /dev/null > $file.diags
 
-# Check for proper termination, except for rcuperf and refperf.
-if test "$TORTURE_SUITE" != rcuperf && test "$TORTURE_SUITE" != refperf
+# Check for proper termination, except for rcuperf and refscale.
+if test "$TORTURE_SUITE" != rcuperf && test "$TORTURE_SUITE" != refscale
 then
 	# check for abject failure
 
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFLIST b/tools/testing/selftests/rcutorture/configs/refscale/CFLIST
similarity index 100%
rename from tools/testing/selftests/rcutorture/configs/refperf/CFLIST
rename to tools/testing/selftests/rcutorture/configs/refscale/CFLIST
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/CFcommon b/tools/testing/selftests/rcutorture/configs/refscale/CFcommon
similarity index 100%
rename from tools/testing/selftests/rcutorture/configs/refperf/CFcommon
rename to tools/testing/selftests/rcutorture/configs/refscale/CFcommon
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
similarity index 100%
rename from tools/testing/selftests/rcutorture/configs/refperf/NOPREEMPT
rename to tools/testing/selftests/rcutorture/configs/refscale/NOPREEMPT
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/PREEMPT b/tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
similarity index 100%
rename from tools/testing/selftests/rcutorture/configs/refperf/PREEMPT
rename to tools/testing/selftests/rcutorture/configs/refscale/PREEMPT
diff --git a/tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh
similarity index 100%
rename from tools/testing/selftests/rcutorture/configs/refperf/ver_functions.sh
rename to tools/testing/selftests/rcutorture/configs/refscale/ver_functions.sh

From 7fef6cff8f2814bf8eb632e2bb8f0a987ffd9ece Mon Sep 17 00:00:00 2001
From: Ethon Paul <ethp@qq.com>
Date: Sat, 18 Apr 2020 19:46:47 +0800
Subject: [PATCH 162/502] srcu: Fix a typo in comment "amoritized"->"amortized"

This commit fixes a typo in a comment.

Signed-off-by: Ethon Paul <ethp@qq.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 6d3ef700fb0e..8ff71e5d0fe8 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -766,7 +766,7 @@ static void srcu_flip(struct srcu_struct *ssp)
  * it, if this function was preempted for enough time for the counters
  * to wrap, it really doesn't matter whether or not we expedite the grace
  * period.  The extra overhead of a needlessly expedited grace period is
- * negligible when amoritized over that time period, and the extra latency
+ * negligible when amortized over that time period, and the extra latency
  * of a needlessly non-expedited grace period is similarly negligible.
  */
 static bool srcu_might_be_idle(struct srcu_struct *ssp)

From bde50d8ff83e4ce9e576f7c5ba1edb48a3610a5b Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 26 May 2020 15:41:34 +0200
Subject: [PATCH 163/502] srcu: Avoid local_irq_save() before acquiring
 spinlock_t

SRCU disables interrupts to get a stable per-CPU pointer and then
acquires the spinlock which is in the per-CPU data structure. The
release uses spin_unlock_irqrestore(). While this is correct on a non-RT
kernel, this conflicts with the RT semantics because the spinlock is
converted to a 'sleeping' spinlock. Sleeping locks can obviously not be
acquired with interrupts disabled.

Acquire the per-CPU pointer `ssp->sda' without disabling preemption and
then acquire the spinlock_t of the per-CPU data structure. The lock will
ensure that the data is consistent.

The added call to check_init_srcu_struct() is now needed because a
statically defined srcu_struct may remain uninitialized until this
point and the newly introduced locking operation requires an initialized
spinlock_t.

This change was tested for four hours with 8*SRCU-N and 8*SRCU-P without
causing any warnings.

Cc: Lai Jiangshan <jiangshanlai@gmail.com>
Cc: "Paul E. McKenney" <paulmck@kernel.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: rcu@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/srcutree.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 8ff71e5d0fe8..c100acf332ed 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -777,14 +777,15 @@ static bool srcu_might_be_idle(struct srcu_struct *ssp)
 	unsigned long t;
 	unsigned long tlast;
 
+	check_init_srcu_struct(ssp);
 	/* If the local srcu_data structure has callbacks, not idle.  */
-	local_irq_save(flags);
-	sdp = this_cpu_ptr(ssp->sda);
+	sdp = raw_cpu_ptr(ssp->sda);
+	spin_lock_irqsave_rcu_node(sdp, flags);
 	if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
-		local_irq_restore(flags);
+		spin_unlock_irqrestore_rcu_node(sdp, flags);
 		return false; /* Callbacks already present, so not idle. */
 	}
-	local_irq_restore(flags);
+	spin_unlock_irqrestore_rcu_node(sdp, flags);
 
 	/*
 	 * No local callbacks, so probabalistically probe global state.
@@ -864,9 +865,8 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
 	}
 	rhp->func = func;
 	idx = srcu_read_lock(ssp);
-	local_irq_save(flags);
-	sdp = this_cpu_ptr(ssp->sda);
-	spin_lock_rcu_node(sdp);
+	sdp = raw_cpu_ptr(ssp->sda);
+	spin_lock_irqsave_rcu_node(sdp, flags);
 	rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
 	rcu_segcblist_advance(&sdp->srcu_cblist,
 			      rcu_seq_current(&ssp->srcu_gp_seq));

From 88513ae533756d10358e406743c21e8cf61fb72a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 28 Apr 2020 14:41:48 -0700
Subject: [PATCH 164/502] torture:  Remove qemu dependency on EFI firmware

On some (probably misconfigured) systems, the torture-test scripting
will cause qemu to complain about missing EFI firmware, often because
qemu is trying to traverse broken symbolic links to find that firmware.
Which is a bit silly given that the default torture-test guest OS has
but a single binary for its userspace, and thus is unlikely to do much
in the way of networking in any case.

This commit therefore avoids such problems by specifying "-net none"
to qemu unless the TORTURE_QEMU_INTERACTIVE environment variable is set
(for example, by having specified "--interactive" to kvm.sh), in which
case "-net nic -net user" is specified to qemu instead.  Either choice
may be overridden by specifying the "-net" argument of your choice to
the kvm.sh "--qemu-args" parameter.

Link: https://lore.kernel.org/lkml/20190701141403.GA246562@google.com
Reported-by: Joel Fernandes <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 .../selftests/rcutorture/bin/functions.sh     | 21 ++++++++++++++++---
 .../rcutorture/bin/kvm-test-1-run.sh          |  1 +
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index 12810229fddc..436b1542cf27 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -215,9 +215,6 @@ identify_qemu_args () {
 		then
 			echo -device spapr-vlan,netdev=net0,mac=$TORTURE_QEMU_MAC
 			echo -netdev bridge,br=br0,id=net0
-		elif test -n "$TORTURE_QEMU_INTERACTIVE"
-		then
-			echo -net nic -net user
 		fi
 		;;
 	esac
@@ -275,3 +272,21 @@ specify_qemu_cpus () {
 		esac
 	fi
 }
+
+# specify_qemu_net qemu-args
+#
+# Outputs qemu-args with "-net none" appended.  However, if the incoming
+# qemu-args already contains "-net", nothing is appended, and otherwise if
+# the TORTURE_QEMU_INTERACTIVE environment variable is set, the string that
+# is appended is instead "-net nic -net user".
+specify_qemu_net () {
+	if echo $1 | grep -q -e -net # Substring match, so "-netdev" also counts.
+	then
+		echo $1
+	elif test -n "$TORTURE_QEMU_INTERACTIVE"
+	then
+		echo $1 -net nic -net user
+	else
+		echo $1 -net none
+	fi
+}
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 6ff611c630d1..1b9aebd54cc9 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -141,6 +141,7 @@ then
 	cpu_count=$TORTURE_ALLOTED_CPUS
 fi
 qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`"
+qemu_args="`specify_qemu_net "$qemu_args"`"
 
 # Generate architecture-specific and interaction-specific qemu arguments
 qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`"

From 6582e7f184e49a754ee09c996a886b89113d7354 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 4 May 2020 15:55:47 -0700
Subject: [PATCH 165/502] torture: Add script to smoke-test commits in a branch

This commit adds a kvm-check-branches.sh script that takes a list
of commits and commit ranges and runs a short rcutorture test on all
scenarios on each specified commit.  A summary is printed at the end, and
the script returns success if all rcutorture runs completed without error.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../rcutorture/bin/kvm-check-branches.sh      | 108 ++++++++++++++++++
 1 file changed, 108 insertions(+)
 create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh
new file mode 100755
index 000000000000..6e65c134e5f1
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-check-branches.sh
@@ -0,0 +1,108 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Run a group of kvm.sh tests on the specified commits.  This currently
+# unconditionally does three-minute runs on each scenario in CFLIST,
+# taking advantage of all available CPUs and trusting the "make" utility.
+# In the short term, adjustments can be made by editing this script and
+# CFLIST.  If some adjustments appear to have ongoing value, this script
+# might grow some command-line arguments.
+#
+# Usage: kvm-check-branches.sh commit1 commit2..commit3 commit4 ...
+#
+# This script considers its arguments one at a time.  If more elaborate
+# specification of commits is needed, please use "git rev-list" to
+# produce something that this simple script can understand.  The reason
+# for retaining the simplicity is that it allows the user to more easily
+# see which commit came from which branch.
+#
+# This script creates a yyyy.mm.dd-hh.mm.ss-group entry in the "res"
+# directory.  The calls to kvm.sh create the usual entries, but this script
+# moves them under the yyyy.mm.dd-hh.mm.ss-group entry, each in its own
+# directory numbered in run order, that is, "0001", "0002", and so on.
+# For successful runs, the large build artifacts are removed.  Doing this
+# reduces the disk space required by about two orders of magnitude for
+# successful runs.
+#
+# Copyright (C) Facebook, 2020
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+if ! git status > /dev/null 2>&1
+then
+	echo '!!!' This script needs to run in a git archive. 1>&2
+	echo '!!!' Giving up. 1>&2
+	exit 1
+fi
+
+# Remember where we started so that we can get back at the end.
+curcommit="`git status | head -1 | awk '{ print $NF }'`"
+
+nfail=0
+ntry=0
+resdir="tools/testing/selftests/rcutorture/res"
+ds="`date +%Y.%m.%d-%H.%M.%S`-group"
+if ! test -e $resdir
+then
+	mkdir $resdir || :
+fi
+mkdir $resdir/$ds
+echo Results directory: $resdir/$ds
+
+KVM="`pwd`/tools/testing/selftests/rcutorture"; export KVM
+PATH=${KVM}/bin:$PATH; export PATH
+. functions.sh
+cpus="`identify_qemu_vcpus`"
+echo Using up to $cpus CPUs.
+
+# Each pass through this loop does one command-line argument.
+for gitbr in $@
+do
+	echo ' --- git branch ' $gitbr
+
+	# Each pass through this loop tests one commit.
+	for i in `git rev-list "$gitbr"`
+	do
+		ntry=`expr $ntry + 1`
+		idir=`awk -v ntry="$ntry" 'END { printf "%04d", ntry; }' < /dev/null`
+		echo ' --- commit ' $i from branch $gitbr
+		date
+		mkdir $resdir/$ds/$idir
+		echo $gitbr > $resdir/$ds/$idir/gitbr
+		echo $i >> $resdir/$ds/$idir/gitbr
+
+		# Test the specified commit.
+		git checkout $i > $resdir/$ds/$idir/git-checkout.out 2>&1
+		echo git checkout return code: $? "(Commit $ntry: $i)"
+		kvm.sh --cpus $cpus --duration 3 --trust-make > $resdir/$ds/$idir/kvm.sh.out 2>&1
+		ret=$?
+		echo kvm.sh return code $ret for commit $i from branch $gitbr
+
+		# Move the build products to their resting place.
+		runresdir="`grep -m 1 '^Results directory:' < $resdir/$ds/$idir/kvm.sh.out | sed -e 's/^Results directory://'`"
+		mv $runresdir $resdir/$ds/$idir
+		rrd="`echo $runresdir | sed -e 's,^.*/,,'`"
+		echo Run results: $resdir/$ds/$idir/$rrd
+		if test "$ret" -ne 0
+		then
+			# Failure, so leave all evidence intact.
+			nfail=`expr $nfail + 1`
+		else
+			# Success, so remove large files to save about 1GB, but only if the cd worked.
+			( cd $resdir/$ds/$idir/$rrd && rm -f */vmlinux */bzImage */System.map */Module.symvers )
+		fi
+	done
+done
+date
+
+# Go back to the original commit.
+git checkout "$curcommit"
+
+if test $nfail -ne 0
+then
+	echo '!!! ' $nfail failures in $ntry 'runs!!!'
+	exit 1
+else
+	echo No failures in $ntry runs.
+	exit 0
+fi

From d02c6b52d12fa30eeabfaf5aefe12078eacb94b2 Mon Sep 17 00:00:00 2001
From: Zou Wei <zou_wei@huawei.com>
Date: Mon, 13 Apr 2020 20:02:59 +0800
Subject: [PATCH 166/502] locktorture: Use true and false to assign to bool
 variables

This commit fixes the following coccicheck warnings:

kernel/locking/locktorture.c:689:6-10: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:907:2-20: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:938:3-20: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:668:2-19: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:674:2-19: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:634:2-20: WARNING: Assignment of 0/1 to bool variable
kernel/locking/locktorture.c:640:2-20: WARNING: Assignment of 0/1 to bool variable

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: Zou Wei <zou_wei@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/locking/locktorture.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 5efbfc68ce99..8ff6f50e06a0 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -631,13 +631,13 @@ static int lock_torture_writer(void *arg)
 		cxt.cur_ops->writelock();
 		if (WARN_ON_ONCE(lock_is_write_held))
 			lwsp->n_lock_fail++;
-		lock_is_write_held = 1;
+		lock_is_write_held = true;
 		if (WARN_ON_ONCE(lock_is_read_held))
 			lwsp->n_lock_fail++; /* rare, but... */
 
 		lwsp->n_lock_acquired++;
 		cxt.cur_ops->write_delay(&rand);
-		lock_is_write_held = 0;
+		lock_is_write_held = false;
 		cxt.cur_ops->writeunlock();
 
 		stutter_wait("lock_torture_writer");
@@ -665,13 +665,13 @@ static int lock_torture_reader(void *arg)
 			schedule_timeout_uninterruptible(1);
 
 		cxt.cur_ops->readlock();
-		lock_is_read_held = 1;
+		lock_is_read_held = true;
 		if (WARN_ON_ONCE(lock_is_write_held))
 			lrsp->n_lock_fail++; /* rare, but... */
 
 		lrsp->n_lock_acquired++;
 		cxt.cur_ops->read_delay(&rand);
-		lock_is_read_held = 0;
+		lock_is_read_held = false;
 		cxt.cur_ops->readunlock();
 
 		stutter_wait("lock_torture_reader");
@@ -686,7 +686,7 @@ static int lock_torture_reader(void *arg)
 static void __torture_print_stats(char *page,
 				  struct lock_stress_stats *statp, bool write)
 {
-	bool fail = 0;
+	bool fail = false;
 	int i, n_stress;
 	long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
 	long long sum = 0;
@@ -904,7 +904,7 @@ static int __init lock_torture_init(void)
 
 	/* Initialize the statistics so that each run gets its own numbers. */
 	if (nwriters_stress) {
-		lock_is_write_held = 0;
+		lock_is_write_held = false;
 		cxt.lwsa = kmalloc_array(cxt.nrealwriters_stress,
 					 sizeof(*cxt.lwsa),
 					 GFP_KERNEL);
@@ -935,7 +935,7 @@ static int __init lock_torture_init(void)
 		}
 
 		if (nreaders_stress) {
-			lock_is_read_held = 0;
+			lock_is_read_held = false;
 			cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
 						 sizeof(*cxt.lrsa),
 						 GFP_KERNEL);

From 4a5f133c15b77c4018e8d7996541868ac94afb4f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 24 Apr 2020 11:21:40 -0700
Subject: [PATCH 167/502] rcutorture: Add races with task-exit processing

Several variants of Linux-kernel RCU interact with task-exit processing,
including preemptible RCU, Tasks RCU, and Tasks Trace RCU.  This commit
therefore adds testing of this interaction to rcutorture by adding
rcutorture.read_exit_burst and rcutorture.read_exit_delay kernel-boot
parameters.  These kernel parameters control the frequency and spacing
of special read-then-exit kthreads that are spawned.

[ paulmck: Apply feedback from Dan Carpenter's static checker. ]
[ paulmck: Reduce latency to avoid false-positive shutdown hangs. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../admin-guide/kernel-parameters.txt         |  14 +++
 include/linux/torture.h                       |   5 +
 kernel/rcu/rcutorture.c                       | 112 +++++++++++++++++-
 3 files changed, 128 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..a0dcc925c8a2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4258,6 +4258,20 @@
 			Set time (jiffies) between CPU-hotplug operations,
 			or zero to disable CPU-hotplug testing.
 
+	rcutorture.read_exit= [KNL]
+			Set the number of read-then-exit kthreads used
+			to test the interaction of RCU updaters and
+			task-exit processing.
+
+	rcutorture.read_exit_burst= [KNL]
+			The number of times in a given read-then-exit
+			episode that a set of read-then-exit kthreads
+			is spawned.
+
+	rcutorture.read_exit_delay= [KNL]
+			The delay, in seconds, between successive
+			read-then-exit testing episodes.
+
 	rcutorture.shuffle_interval= [KNL]
 			Set task-shuffle interval (s).  Shuffling tasks
 			allows some CPUs to go into dyntick-idle mode
diff --git a/include/linux/torture.h b/include/linux/torture.h
index 629b66e6c161..7f65bd1dd307 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -55,6 +55,11 @@ struct torture_random_state {
 #define DEFINE_TORTURE_RANDOM_PERCPU(name) \
 	DEFINE_PER_CPU(struct torture_random_state, name)
 unsigned long torture_random(struct torture_random_state *trsp);
+static inline void torture_random_init(struct torture_random_state *trsp)
+{
+	trsp->trs_state = 0;
+	trsp->trs_count = 0;
+}
 
 /* Task shuffler, which causes CPUs to occasionally go idle. */
 void torture_shuffle_task_register(struct task_struct *tp);
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index efb792e13fca..2621a339c8a4 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -109,6 +109,10 @@ torture_param(int, object_debug, 0,
 torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
 torture_param(int, onoff_interval, 0,
 	     "Time between CPU hotplugs (jiffies), 0=disable");
+torture_param(int, read_exit_delay, 13,
+	      "Delay between read-then-exit episodes (s)");
+torture_param(int, read_exit_burst, 16,
+	      "# of read-then-exit bursts per episode, zero to disable");
 torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
 torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
 torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
@@ -146,6 +150,7 @@ static struct task_struct *stall_task;
 static struct task_struct *fwd_prog_task;
 static struct task_struct **barrier_cbs_tasks;
 static struct task_struct *barrier_task;
+static struct task_struct *read_exit_task;
 
 #define RCU_TORTURE_PIPE_LEN 10
 
@@ -177,6 +182,7 @@ static long n_rcu_torture_boosts;
 static atomic_long_t n_rcu_torture_timers;
 static long n_barrier_attempts;
 static long n_barrier_successes; /* did rcu_barrier test succeed? */
+static unsigned long n_read_exits;
 static struct list_head rcu_torture_removed;
 static unsigned long shutdown_jiffies;
 
@@ -1539,10 +1545,11 @@ rcu_torture_stats_print(void)
 		n_rcu_torture_boosts,
 		atomic_long_read(&n_rcu_torture_timers));
 	torture_onoff_stats();
-	pr_cont("barrier: %ld/%ld:%ld\n",
+	pr_cont("barrier: %ld/%ld:%ld ",
 		data_race(n_barrier_successes),
 		data_race(n_barrier_attempts),
 		data_race(n_rcu_torture_barrier_error));
+	pr_cont("read-exits: %ld\n", data_race(n_read_exits));
 
 	pr_alert("%s%s ", torture_type, TORTURE_FLAG);
 	if (atomic_read(&n_rcu_torture_mberror) ||
@@ -1634,7 +1641,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
 		 "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
 		 "stall_cpu_block=%d "
 		 "n_barrier_cbs=%d "
-		 "onoff_interval=%d onoff_holdoff=%d\n",
+		 "onoff_interval=%d onoff_holdoff=%d "
+		 "read_exit_delay=%d read_exit_burst=%d\n",
 		 torture_type, tag, nrealreaders, nfakewriters,
 		 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
 		 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -1643,7 +1651,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
 		 stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
 		 stall_cpu_block,
 		 n_barrier_cbs,
-		 onoff_interval, onoff_holdoff);
+		 onoff_interval, onoff_holdoff,
+		 read_exit_delay, read_exit_burst);
 }
 
 static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -2338,6 +2347,99 @@ static bool rcu_torture_can_boost(void)
 	return true;
 }
 
+static bool read_exit_child_stop;
+static bool read_exit_child_stopped;
+static wait_queue_head_t read_exit_wq;
+
+// Child kthread: waits at MAX_NICE for kthread_stop(), then does one rcutorture read and exits.
+static int rcu_torture_read_exit_child(void *trsp_in)
+{
+	struct torture_random_state *trsp = trsp_in;
+
+	set_user_nice(current, MAX_NICE);
+	// Wait to be stopped before reading, minimizing time between read and exit.
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	(void)rcu_torture_one_read(trsp); // Return value deliberately ignored.
+	return 0;
+}
+
+// Parent kthread which repeatedly spawns and then stops read-exit child kthreads.
+static int rcu_torture_read_exit(void *unused)
+{
+	int count = 0;
+	bool errexit = false;
+	int i;
+	struct task_struct *tsp;
+	DEFINE_TORTURE_RANDOM(trs);
+
+	// Initialize: run this kthread at MAX_NICE.
+	set_user_nice(current, MAX_NICE);
+	VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of test");
+
+	// Each pass spawns and stops one child; an episode ends after read_exit_burst passes.
+	do {
+		if (++count > read_exit_burst) {
+			VERBOSE_TOROUT_STRING("rcu_torture_read_exit: End of episode");
+			rcu_barrier(); // Wait for task_struct free, avoid OOM.
+			for (i = 0; i < read_exit_delay; i++) { // HZ-jiffy sleeps, polling for stop.
+				schedule_timeout_uninterruptible(HZ);
+				if (READ_ONCE(read_exit_child_stop))
+					break;
+			}
+			if (!READ_ONCE(read_exit_child_stop))
+				VERBOSE_TOROUT_STRING("rcu_torture_read_exit: Start of episode");
+			count = 0;
+		}
+		if (READ_ONCE(read_exit_child_stop))
+			break;
+		// Spawn child, handing it this kthread's torture_random_state.
+		tsp = kthread_run(rcu_torture_read_exit_child,
+				     &trs, "%s",
+				     "rcu_torture_read_exit_child");
+		if (IS_ERR(tsp)) {
+			VERBOSE_TOROUT_ERRSTRING("out of memory");
+			errexit = true;
+			tsp = NULL;
+			break;
+		}
+		cond_resched();
+		kthread_stop(tsp); // Ends the child's wait loop so that it reads and exits.
+		n_read_exits ++;
+		stutter_wait("rcu_torture_read_exit");
+	} while (!errexit && !READ_ONCE(read_exit_child_stop));
+
+	// Tell rcu_torture_read_exit_cleanup() that child spawning has ceased.
+	smp_store_release(&read_exit_child_stopped, true); // After reaping.
+	smp_mb(); // Store before wakeup.
+	wake_up(&read_exit_wq);
+	while (!torture_must_stop())
+		schedule_timeout_uninterruptible(1);
+	torture_kthread_stopping("rcu_torture_read_exit");
+	return 0;
+}
+
+static int rcu_torture_read_exit_init(void)
+{
+	if (read_exit_burst <= 0)
+		return 0; // Read-exit testing disabled, which is not an error.
+	init_waitqueue_head(&read_exit_wq); // Cleanup waits on this for the parent kthread.
+	read_exit_child_stop = false;
+	read_exit_child_stopped = false;
+	return torture_create_kthread(rcu_torture_read_exit, NULL,
+				      read_exit_task);
+}
+
+static void rcu_torture_read_exit_cleanup(void)
+{
+	if (!read_exit_task)
+		return; // No parent kthread to stop.
+	WRITE_ONCE(read_exit_child_stop, true);
+	smp_mb(); // Above write before wait.
+	wait_event(read_exit_wq, smp_load_acquire(&read_exit_child_stopped));
+	torture_stop_kthread(rcu_torture_read_exit, read_exit_task); // Name matches the kthread function.
+}
+
 static enum cpuhp_state rcutor_hp;
 
 static void
@@ -2359,6 +2461,7 @@ rcu_torture_cleanup(void)
 	}
 
 	show_rcu_gp_kthreads();
+	rcu_torture_read_exit_cleanup();
 	rcu_torture_barrier_cleanup();
 	torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task);
 	torture_stop_kthread(rcu_torture_stall, stall_task);
@@ -2680,6 +2783,9 @@ rcu_torture_init(void)
 	if (firsterr)
 		goto unwind;
 	firsterr = rcu_torture_barrier_init();
+	if (firsterr)
+		goto unwind;
+	firsterr = rcu_torture_read_exit_init();
 	if (firsterr)
 		goto unwind;
 	if (object_debug)

From 61251d6899803594a108c3165aeb072c73e09cc8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 26 Apr 2020 16:48:46 -0700
Subject: [PATCH 168/502] torture: Set configfile variable to current scenario

The torture-test recheck logic fails to set the configfile variable to
the current scenario, so this commit properly initializes this variable.
This change isn't critical given that all errors for a given scenario
follow that scenario's heading, but it is easier on the eyes to repeat it.
And this repetition also prevents confusion as to whether a given message
goes with the previous heading or the next one.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index 736f04749b90..2261aa676304 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -31,6 +31,7 @@ do
 			head -1 $resdir/log
 		fi
 		TORTURE_SUITE="`cat $i/../TORTURE_SUITE`"
+		configfile=`echo $i | sed -e 's,^.*/,,'`
 		rm -f $i/console.log.*.diags
 		kvm-recheck-${TORTURE_SUITE}.sh $i
 		if test -f "$i/qemu-retval" && test "`cat $i/qemu-retval`" -ne 0 && test "`cat $i/qemu-retval`" -ne 137

From 59359e4f2a0906920389ec1e33296ac9a19178ba Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 26 Apr 2020 16:51:56 -0700
Subject: [PATCH 169/502] rcutorture: Handle non-statistic bang-string error
 messages

The current console parsing assumes that console lines containing "!!!"
are statistics lines from which it can parse the number of rcutorture
too-short grace-period failures.  This prints confusing output for
other problems, including memory exhaustion.  This commit therefore
differentiates between these cases and prints an appropriate error string.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../selftests/rcutorture/bin/parse-console.sh  | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 4bf62d7b1cbc..1c64ca85438c 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -44,11 +44,23 @@ then
 		tail -1 |
 		awk '
 		{
-			for (i=NF-8;i<=NF;i++)
+			normalexit = 1;
+			for (i=NF-8;i<=NF;i++) {
+				if (i <= 0 || i !~ /^[0-9]*$/) {
+					bangstring = $0;
+					gsub(/^\[[^]]*] /, "", bangstring);
+					print bangstring;
+					normalexit = 0;
+					exit 0;
+				}
 				sum+=$i;
+			}
 		}
-		END { print sum }'`
-		print_bug $title FAILURE, $nerrs instances
+		END {
+			if (normalexit)
+				print sum " instances"
+		}'`
+		print_bug $title FAILURE, $nerrs
 		exit
 	fi
 

From cae7cc6ba5bad320c2055ac54f73affd051e76ca Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Sun, 26 Apr 2020 19:20:37 -0700
Subject: [PATCH 170/502] rcutorture: NULL rcu_torture_current earlier in
 cleanup code

Currently, the rcu_torture_current variable remains non-NULL until after
all readers have stopped.  During this time, rcu_torture_stats_print()
will think that the test is still ongoing, which can result in confusing
dmesg output.  This commit therefore NULLs rcu_torture_current immediately
after the rcu_torture_writer() kthread has decided to stop, thus informing
rcu_torture_stats_print() much sooner.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 2621a339c8a4..59112077a6da 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1172,6 +1172,7 @@ rcu_torture_writer(void *arg)
 					WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count);
 				}
 	} while (!torture_must_stop());
+	rcu_torture_current = NULL;  // Let stats task know that we are done.
 	/* Reset expediting back to unexpedited. */
 	if (expediting > 0)
 		expediting = -expediting;
@@ -2473,7 +2474,6 @@ rcu_torture_cleanup(void)
 					     reader_tasks[i]);
 		kfree(reader_tasks);
 	}
-	rcu_torture_current = NULL;
 
 	if (fakewriter_tasks) {
 		for (i = 0; i < nfakewriters; i++) {

From d3cb26312ecfdb4ee8dedf931e24e60df1d7fbc9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 4 May 2020 16:40:53 -0700
Subject: [PATCH 171/502] torture: Remove whitespace from identify_qemu_vcpus
 output

The identify_qemu_vcpus bash function can return numbers including
whitespace characters, which can be a bit annoying in some bash
dollar-sign substitutions.  This commit therefore strips all spaces and
tabs from the value that identify_qemu_vcpus outputs.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/functions.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh b/tools/testing/selftests/rcutorture/bin/functions.sh
index 436b1542cf27..51f3464b96d3 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -231,7 +231,7 @@ identify_qemu_args () {
 # Returns the number of virtual CPUs available to the aggregate of the
 # guest OSes.
 identify_qemu_vcpus () {
-	lscpu | grep '^CPU(s):' | sed -e 's/CPU(s)://'
+	lscpu | grep '^CPU(s):' | sed -e 's/CPU(s)://' -e 's/[ 	]*//g'
 }
 
 # print_bug

From a3ba4972f2ef8408dcc8a2a3d433621d6c990594 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Mon, 4 May 2020 16:41:53 -0700
Subject: [PATCH 172/502] torture: Add --allcpus argument to the kvm.sh script

Leaving off the kvm.sh script's --cpus argument results in the script
testing the scenarios sequentially, which can be quite slow.  However,
having to specify the actual number of CPUs can be error-prone.
This commit therefore adds a --allcpus argument that causes kvm.sh to
use all available CPUs.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index c279cf9cb010..7dbce7a43413 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -73,6 +73,10 @@ usage () {
 while test $# -gt 0
 do
 	case "$1" in
+	--allcpus)
+		cpus=$TORTURE_ALLOTED_CPUS
+		max_cpus=$TORTURE_ALLOTED_CPUS
+		;;
 	--bootargs|--bootarg)
 		checkarg --bootargs "(list of kernel boot arguments)" "$#" "$2" '.*' '^--'
 		TORTURE_BOOTARGS="$2"

From 8f43d5911b38f00dfa46169dcb1feb1e101dd906 Mon Sep 17 00:00:00 2001
From: Jules Irenge <jbi.octave@gmail.com>
Date: Mon, 1 Jun 2020 19:45:48 +0100
Subject: [PATCH 173/502] rcu/rcutorture: Replace 0 with false

Coccinelle reports a warning

WARNING: Assignment of 0/1 to bool variable

The root cause is that the variable lastphase is a bool, but is
initialised with integer 0.  This commit therefore replaces the 0 with
a false.

Signed-off-by: Jules Irenge <jbi.octave@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 59112077a6da..37455a12898e 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2185,7 +2185,7 @@ static void rcu_torture_barrier1cb(void *rcu_void)
 static int rcu_torture_barrier_cbs(void *arg)
 {
 	long myid = (long)arg;
-	bool lastphase = 0;
+	bool lastphase = false;
 	bool newphase;
 	struct rcu_head rcu;
 

From 3e93a51f191aa710760591961240f8910d952b5b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Fri, 5 Jun 2020 10:29:28 -0700
Subject: [PATCH 174/502] torture: Create qemu-cmd in --buildonly runs

One reason to do a --buildonly run is to use the build products elsewhere,
for example, to do the actual test on some other system.  Part of doing
the test is the actual qemu command, which is not currently produced
by --buildonly runs.  This commit therefore causes --buildonly runs to
create this file.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 1b9aebd54cc9..064dd735de39 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -153,6 +153,7 @@ qemu_append="`identify_qemu_append "$QEMU"`"
 boot_args="`configfrag_boot_params "$boot_args" "$config_template"`"
 # Generate kernel-version-specific boot parameters
 boot_args="`per_version_boot_params "$boot_args" $resdir/.config $seconds`"
+echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd
 
 if test -n "$TORTURE_BUILDONLY"
 then
@@ -161,7 +162,6 @@ then
 	exit 0
 fi
 echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log
-echo $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd
 ( $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append "$qemu_append $boot_args" > $resdir/qemu-output 2>&1 & echo $! > $resdir/qemu_pid; wait `cat  $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) &
 commandcompleted=0
 sleep 10 # Give qemu's pid a chance to reach the file

From 6387ecbc94bf5ac07239104b84d2304da6e79b51 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 9 Jun 2020 17:58:30 -0700
Subject: [PATCH 175/502] torture: Add a stop-run capability

When bisecting RCU issues, it is often the case that the first error in
an unsuccessful run will happen quickly, but that a successful run must
go on for some time in order to obtain a sufficiently low false-negative
error rate.  In many cases, a bisection requires multiple concurrent
runs, in which case the first failure in any run indicates failure,
pure and simple.  In such cases, it would speed things up greatly if
the first failure terminated all runs.

This commit therefore adds scripting that checks for a file named "STOP"
in the top-level results directory, terminating the run when it appears.
Note that in-progress builds will continue until completion, but future
builds and all runs will be cut short.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/jitter.sh    |  6 ++++++
 tools/testing/selftests/rcutorture/bin/kvm-build.sh |  6 ++++++
 .../selftests/rcutorture/bin/kvm-test-1-run.sh      | 13 +++++++++++--
 tools/testing/selftests/rcutorture/bin/kvm.sh       |  2 ++
 4 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/jitter.sh b/tools/testing/selftests/rcutorture/bin/jitter.sh
index 30cb5b27d32e..188b864bc4bf 100755
--- a/tools/testing/selftests/rcutorture/bin/jitter.sh
+++ b/tools/testing/selftests/rcutorture/bin/jitter.sh
@@ -46,6 +46,12 @@ do
 		exit 0;
 	fi
 
+	# Check for stop request.
+	if test -f "$TORTURE_STOPFILE"
+	then
+		exit 1;
+	fi
+
 	# Set affinity to randomly selected online CPU
 	if cpus=`grep 1 /sys/devices/system/cpu/*/online 2>&1 |
 		 sed -e 's,/[^/]*$,,' -e 's/^[^0-9]*//'`
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-build.sh b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
index 18d6518504ee..115e1822b26f 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-build.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-build.sh
@@ -9,6 +9,12 @@
 #
 # Authors: Paul E. McKenney <paulmck@linux.ibm.com>
 
+if test -f "$TORTURE_STOPFILE"
+then
+	echo "kvm-build.sh early exit due to run STOP request"
+	exit 1
+fi
+
 config_template=${1}
 if test -z "$config_template" -o ! -f "$config_template" -o ! -r "$config_template"
 then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 064dd735de39..5ec095da095f 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -182,7 +182,7 @@ do
 	kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
 	if test -z "$qemu_pid" || kill -0 "$qemu_pid" > /dev/null 2>&1
 	then
-		if test $kruntime -ge $seconds
+		if test $kruntime -ge $seconds -o -f "$TORTURE_STOPFILE"
 		then
 			break;
 		fi
@@ -211,10 +211,19 @@ then
 fi
 if test $commandcompleted -eq 0 -a -n "$qemu_pid"
 then
-	echo Grace period for qemu job at pid $qemu_pid
+	if ! test -f "$TORTURE_STOPFILE"
+	then
+		echo Grace period for qemu job at pid $qemu_pid
+	fi
 	oldline="`tail $resdir/console.log`"
 	while :
 	do
+		if test -f "$TORTURE_STOPFILE"
+		then
+			echo "PID $qemu_pid killed due to run STOP request" >> $resdir/Warnings 2>&1
+			kill -KILL $qemu_pid
+			break
+		fi
 		kruntime=`gawk 'BEGIN { print systime() - '"$kstarttime"' }' < /dev/null`
 		if kill -0 $qemu_pid > /dev/null 2>&1
 		then
diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 7dbce7a43413..3578c85ea8c4 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -337,6 +337,8 @@ then
 	mkdir -p "$resdir" || :
 fi
 mkdir $resdir/$ds
+TORTURE_RESDIR="$resdir/$ds"; export TORTURE_RESDIR
+TORTURE_STOPFILE="$resdir/$ds/STOP"; export TORTURE_STOPFILE
 echo Results directory: $resdir/$ds
 echo $scriptname $args
 touch $resdir/$ds/log

From bc77a72cd188d44881ee1b9d0a9d65ca8108b508 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 10 Jun 2020 14:08:19 -0700
Subject: [PATCH 176/502] torture: Abstract out console-log error detection

This commit pulls the simple pattern-based error detection from the
console log into a new console-badness.sh file.  This will enable future
commits to end a run on the first error.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../selftests/rcutorture/bin/console-badness.sh  | 16 ++++++++++++++++
 .../selftests/rcutorture/bin/parse-console.sh    |  5 +----
 2 files changed, 17 insertions(+), 4 deletions(-)
 create mode 100755 tools/testing/selftests/rcutorture/bin/console-badness.sh

diff --git a/tools/testing/selftests/rcutorture/bin/console-badness.sh b/tools/testing/selftests/rcutorture/bin/console-badness.sh
new file mode 100755
index 000000000000..0e4c0b2eb7f0
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/console-badness.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Scan standard input for error messages, dumping any found to standard
+# output.
+#
+# Usage: console-badness.sh
+#
+# Copyright (C) 2020 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for|!!!' |
+grep -v 'ODEBUG: ' |
+grep -v 'This means that this is a DEBUG kernel and it is' |
+grep -v 'Warning: unable to open an initial console'
diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh
index 1c64ca85438c..98478e12ac3d 100755
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -116,10 +116,7 @@ then
 	fi
 fi | tee -a $file.diags
 
-egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|self-detected stall on CPU|Stall ended before state dump start|\?\?\? Writer stall state|rcu_.*kthread starved for' < $file |
-grep -v 'ODEBUG: ' |
-grep -v 'This means that this is a DEBUG kernel and it is' |
-grep -v 'Warning: unable to open an initial console' > $T.diags
+console-badness.sh < $file > $T.diags
 if test -s $T.diags
 then
 	print_warning "Assertion failure in $file $title"

From 775227511843202e65a7f194cbf64f38de01f004 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Thu, 11 Jun 2020 16:43:14 -0700
Subject: [PATCH 177/502] rcutorture: Check for unwatched readers

RCU is supposed to be watching all non-idle kernel code and also all
softirq handlers.  This commit adds some teeth to this statement by
adding a WARN_ON_ONCE().

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/rcu/rcutorture.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 37455a12898e..9c310016585b 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -1377,6 +1377,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp)
 	struct rt_read_seg *rtrsp1;
 	unsigned long long ts;
 
+	WARN_ON_ONCE(!rcu_is_watching());
 	newstate = rcutorture_extend_mask(readstate, trsp);
 	rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
 	started = cur_ops->get_gp_seq();

From 603d11ad6976e1289f19c2a19e2f75a83d0dc296 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 16 Jun 2020 11:49:24 +0200
Subject: [PATCH 178/502] torture: Pass --kmake-arg to all make invocations

We need to pass the arguments provided to --kmake-arg to all make
invocations. In particular, the make invocations generating the configs
need to see the final make arguments, e.g. if config variables depend on
particular variables that are passed to make.

For example, when using '--kcsan --kmake-arg CC=clang-11', we would lose
CONFIG_KCSAN=y due to 'make oldconfig' not seeing that we want to use a
compiler that supports KCSAN.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/configinit.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/configinit.sh b/tools/testing/selftests/rcutorture/bin/configinit.sh
index 93e80a42249a..d6e5ce084b1c 100755
--- a/tools/testing/selftests/rcutorture/bin/configinit.sh
+++ b/tools/testing/selftests/rcutorture/bin/configinit.sh
@@ -32,11 +32,11 @@ if test -z "$TORTURE_TRUST_MAKE"
 then
 	make clean > $resdir/Make.clean 2>&1
 fi
-make $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1
+make $TORTURE_KMAKE_ARG $TORTURE_DEFCONFIG > $resdir/Make.defconfig.out 2>&1
 mv .config .config.sav
 sh $T/upd.sh < .config.sav > .config
 cp .config .config.new
-yes '' | make oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err
+yes '' | make $TORTURE_KMAKE_ARG oldconfig > $resdir/Make.oldconfig.out 2> $resdir/Make.oldconfig.err
 
 # verify new config matches specification.
 configcheck.sh .config $c

From 6bcaf2a0876633b6a7c5e70ee88801e16280210a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 16 Jun 2020 10:02:54 -0700
Subject: [PATCH 179/502] torture: Correctly summarize build-only runs

Currently, kvm-recheck.sh complains that qemu failed for --buildonly
runs, which is sort of true given that qemu can hardly succeed if not
invoked in the first place.  Nevertheless, this commit swaps the order
of checks in kvm-recheck.sh so that --buildonly runs will be summarized
more straightforwardly.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index 2261aa676304..357899cfe249 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -56,15 +56,15 @@ do
 				cat $i/Warnings
 			fi
 		else
-			if test -f "$i/qemu-cmd"
-			then
-				print_bug qemu failed
-				echo "   $i"
-			elif test -f "$i/buildonly"
+			if test -f "$i/buildonly"
 			then
 				echo Build-only run, no boot/test
 				configcheck.sh $i/.config $i/ConfigFragment
 				parse-build.sh $i/Make.out $configfile
+			elif test -f "$i/qemu-cmd"
+			then
+				print_bug qemu failed
+				echo "   $i"
 			else
 				print_bug Build failed
 				echo "   $i"

From 61b77be09e29e6dc152b1984691e5b1708e8a6ac Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 16 Jun 2020 10:38:57 -0700
Subject: [PATCH 180/502] torture: Improve diagnostic for KCSAN-incapable
 compilers

Using --kcsan when the compiler does not support KCSAN results in this:

:CONFIG_KCSAN=y: improperly set
:CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000: improperly set
:CONFIG_KCSAN_VERBOSE=y: improperly set
:CONFIG_KCSAN_INTERRUPT_WATCHER=y: improperly set
Clean KCSAN run in /home/git/linux-rcu/tools/testing/selftests/rcutorture/res/2020.06.16-09.53.16

This is a bit obtuse, so this commit adds checks resulting in this:

:CONFIG_KCSAN=y: improperly set
:CONFIG_KCSAN_REPORT_ONCE_IN_MS=100000: improperly set
:CONFIG_KCSAN_VERBOSE=y: improperly set
:CONFIG_KCSAN_INTERRUPT_WATCHER=y: improperly set
Compiler or architecture does not support KCSAN!
Did you forget to switch your compiler with --kmake-arg CC=<cc-that-supports-kcsan>?

Suggested-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Marco Elver <elver@google.com>
---
 tools/testing/selftests/rcutorture/bin/kvm-recheck.sh | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
index 357899cfe249..840a4679a0d7 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck.sh
@@ -44,7 +44,8 @@ do
 			then
 				echo QEMU killed
 			fi
-			configcheck.sh $i/.config $i/ConfigFragment
+			configcheck.sh $i/.config $i/ConfigFragment > $T 2>&1
+			cat $T
 			if test -r $i/Make.oldconfig.err
 			then
 				cat $i/Make.oldconfig.err
@@ -73,7 +74,11 @@ do
 	done
 	if test -f "$rd/kcsan.sum"
 	then
-		if test -s "$rd/kcsan.sum"
+		if grep -q CONFIG_KCSAN=y $T
+		then
+			echo "Compiler or architecture does not support KCSAN!"
+			echo Did you forget to switch your compiler with '--kmake-arg CC=<cc-that-supports-kcsan>'?
+		elif test -s "$rd/kcsan.sum"
 		then
 			echo KCSAN summary in $rd/kcsan.sum
 		else

From 9ccba350bd824ecacbfd8965f4f3ac980b96f951 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 16 Jun 2020 11:16:18 -0700
Subject: [PATCH 181/502] torture: Add more tracing crib notes to kvm.sh

This commit adds a few more hints about how to use tracing as comments
at the end of kvm.sh.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 3578c85ea8c4..bdfa0c076ae6 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -503,3 +503,7 @@ fi
 # Tracing: trace_event=rcu:rcu_grace_period,rcu:rcu_future_grace_period,rcu:rcu_grace_period_init,rcu:rcu_nocb_wake,rcu:rcu_preempt_task,rcu:rcu_unlock_preempted_task,rcu:rcu_quiescent_state_report,rcu:rcu_fqs,rcu:rcu_callback,rcu:rcu_kfree_callback,rcu:rcu_batch_start,rcu:rcu_invoke_callback,rcu:rcu_invoke_kfree_callback,rcu:rcu_batch_end,rcu:rcu_torture_read,rcu:rcu_barrier
 # Function-graph tracing: ftrace=function_graph ftrace_graph_filter=sched_setaffinity,migration_cpu_stop
 # Also --kconfig "CONFIG_FUNCTION_TRACER=y CONFIG_FUNCTION_GRAPH_TRACER=y"
+# Control buffer size: --bootargs trace_buf_size=3k
+# Get trace-buffer dumps on all oopses: --bootargs ftrace_dump_on_oops
+# Ditto, but dump only the oopsing CPU: --bootargs ftrace_dump_on_oops=orig_cpu
+# Heavy-handed way to also dump on warnings: --bootargs panic_on_warn

From 06efa9b4b27f926eeb8c935f430f8557eb8b106e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 16 Jun 2020 14:14:09 -0700
Subject: [PATCH 182/502] torture: Add kvm-transform.sh script for qemu-cmd
 files

This commit adds a script that transforms qemu-cmd files to allow them
and the corresponding kernels to be run in contexts other than the one
that they were created for, including on systems other than the one that
they were built on.  For example, this allows the build products from a
--buildonly run to be transformed to allow distributed rcutorture testing.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../selftests/rcutorture/bin/kvm-transform.sh | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-transform.sh

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-transform.sh b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh
new file mode 100755
index 000000000000..c45a953ef393
--- /dev/null
+++ b/tools/testing/selftests/rcutorture/bin/kvm-transform.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0+
+#
+# Transform a qemu-cmd file to allow reuse.
+#
+# Usage: kvm-transform.sh bzImage console.log < qemu-cmd-in > qemu-cmd-out
+#
+#	bzImage: Kernel and initrd from the same prior kvm.sh run.
+#	console.log: File into which to place console output.
+#
+# The original qemu-cmd file is provided on standard input.
+# The transformed qemu-cmd file is on standard output.
+# The transformation assumes that the qemu command is confined to a
+# single line.  It also assumes no whitespace in filenames.
+#
+# Copyright (C) 2020 Facebook, Inc.
+#
+# Authors: Paul E. McKenney <paulmck@kernel.org>
+
+image="$1"
+if test -z "$image"
+then
+	echo Need kernel image file.
+	exit 1
+fi
+consolelog="$2"
+if test -z "$consolelog"
+then
+	echo "Need console log file name."
+	exit 1
+fi
+
+awk -v image="$image" -v consolelog="$consolelog" '
+{
+	line = "";
+	for (i = 1; i <= NF; i++) {
+		if (line == "")
+			line = $i;
+		else
+			line = line " " $i;
+		if ($i == "-serial") {
+			i++;
+			line = line " file:" consolelog;
+		}
+		if ($i == "-kernel") {
+			i++;
+			line = line " " image;
+		}
+	}
+	print line;
+}'

From 2102ad290af06119ccfb56ddc3a0e5011a91537e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 16 Jun 2020 15:38:24 -0700
Subject: [PATCH 183/502] torture: Dump ftrace at shutdown only if requested

If there is a large number of torture tests running concurrently,
all of which are dumping large ftrace buffers at shutdown time, the
resulting dumping can take a very long time, particularly on systems
with rotating-rust storage.  This commit therefore adds a default-off
torture.ftrace_dump_at_shutdown module parameter that enables
shutdown-time ftrace-buffer dumping.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/admin-guide/kernel-parameters.txt | 7 +++++++
 kernel/torture.c                                | 6 +++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a0dcc925c8a2..9f11ff80d4ad 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5096,6 +5096,13 @@
 			Prevent the CPU-hotplug component of torturing
 			until after init has spawned.
 
+	torture.ftrace_dump_at_shutdown= [KNL]
+			Dump the ftrace buffer at torture-test shutdown,
+			even if there were no errors.  This can be a
+			very costly operation when many torture tests
+			are running concurrently, especially on systems
+			with rotating-rust storage.
+
 	tp720=		[HW,PS2]
 
 	tpm_suspend_pcr=[HW,TPM]
diff --git a/kernel/torture.c b/kernel/torture.c
index a1a41484ff6d..1061492f14bd 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -45,6 +45,9 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
 static bool disable_onoff_at_boot;
 module_param(disable_onoff_at_boot, bool, 0444);
 
+static bool ftrace_dump_at_shutdown;
+module_param(ftrace_dump_at_shutdown, bool, 0444);
+
 static char *torture_type;
 static int verbose;
 
@@ -527,7 +530,8 @@ static int torture_shutdown(void *arg)
 		torture_shutdown_hook();
 	else
 		VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
-	rcu_ftrace_dump(DUMP_ALL);
+	if (ftrace_dump_at_shutdown)
+		rcu_ftrace_dump(DUMP_ALL);
 	kernel_power_off();	/* Shut down the system. */
 	return 0;
 }

From 316db5897ee5d7408f2adea4d5992ed380316928 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 16 Jun 2020 16:34:52 -0700
Subject: [PATCH 184/502] torture: Avoid duplicate specification of qemu
 command

Currently, the qemu command is constructed twice, once to dump it
to the qemu-cmd file and again to execute it.  This is of course an
accident waiting to happen, but is done to ensure that the remainder
of the script has an accurate idea of the running qemu command's PID.
This commit therefore places both the qemu command and the PID capture
into a new temporary file and sources that temporary file.  Thus the
single construction of the qemu command into the qemu-cmd file suffices
for both purposes.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 5ec095da095f..484445bd3010 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -161,8 +161,16 @@ then
 	touch $resdir/buildonly
 	exit 0
 fi
+
+# Decorate qemu-cmd with redirection, backgrounding, and PID capture
+sed -e 's/$/ 2>\&1 \&/' < $resdir/qemu-cmd > $T/qemu-cmd
+echo 'echo $! > $resdir/qemu_pid' >> $T/qemu-cmd
+
+# In case qemu refuses to run...
 echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log
-( $QEMU $qemu_args -m $TORTURE_QEMU_MEM -kernel $KERNEL -append "$qemu_append $boot_args" > $resdir/qemu-output 2>&1 & echo $! > $resdir/qemu_pid; wait `cat  $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) &
+
+# Attempt to run qemu
+( . $T/qemu-cmd; wait `cat  $resdir/qemu_pid`; echo $? > $resdir/qemu-retval ) &
 commandcompleted=0
 sleep 10 # Give qemu's pid a chance to reach the file
 if test -s "$resdir/qemu_pid"

From 7a6bbeaa01f71af2722fd775a4a4ff9593d12838 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Tue, 16 Jun 2020 17:07:15 -0700
Subject: [PATCH 185/502] torture: Remove obsolete "cd $KVM"

In the dim distant past, qemu commands needed to be run from the
rcutorture directory, but this is no longer the case.  This commit
therefore removes the now-useless "cd $KVM" from the kvm-test-1-run.sh
script.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
index 484445bd3010..e07779a62634 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -124,7 +124,6 @@ seconds=$4
 qemu_args=$5
 boot_args=$6
 
-cd $KVM
 kstarttime=`gawk 'BEGIN { print systime() }' < /dev/null`
 if test -z "$TORTURE_BUILDONLY"
 then

From cda099b37d7165fc73a63961739acf026444cde2 Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Wed, 19 Feb 2020 11:00:54 -0800
Subject: [PATCH 186/502] fork: Annotate a data race in vm_area_dup()

struct vm_area_struct could be accessed concurrently as noticed by
KCSAN,

 write to 0xffff9cf8bba08ad8 of 8 bytes by task 14263 on cpu 35:
  vma_interval_tree_insert+0x101/0x150:
  rb_insert_augmented_cached at include/linux/rbtree_augmented.h:58
  (inlined by) vma_interval_tree_insert at mm/interval_tree.c:23
  __vma_link_file+0x6e/0xe0
  __vma_link_file at mm/mmap.c:629
  vma_link+0xa2/0x120
  mmap_region+0x753/0xb90
  do_mmap+0x45c/0x710
  vm_mmap_pgoff+0xc0/0x130
  ksys_mmap_pgoff+0x1d1/0x300
  __x64_sys_mmap+0x33/0x40
  do_syscall_64+0x91/0xc44
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

 read to 0xffff9cf8bba08a80 of 200 bytes by task 14262 on cpu 122:
  vm_area_dup+0x6a/0xe0
  vm_area_dup at kernel/fork.c:362
  __split_vma+0x72/0x2a0
  __split_vma at mm/mmap.c:2661
  split_vma+0x5a/0x80
  mprotect_fixup+0x368/0x3f0
  do_mprotect_pkey+0x263/0x420
  __x64_sys_mprotect+0x51/0x70
  do_syscall_64+0x91/0xc44
  entry_SYSCALL_64_after_hwframe+0x49/0xbe

vm_area_dup() blindly copies all fields of original VMA to the new one.
This includes copying vm_area_struct::shared.rb which is normally
protected by i_mmap_lock. But this is fine because the read value will
be overwritten on the following __vma_link_file() under proper
protection. Thus, mark it as an intentional data race and insert a few
assertions for the fields that should not be modified concurrently.

Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/fork.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 142b23645d82..bba10fbcdce7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -359,7 +359,13 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
 
 	if (new) {
-		*new = *orig;
+		ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
+		ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
+		/*
+		 * orig->shared.rb may be modified concurrently, but the clone
+		 * will be reinitialized.
+		 */
+		*new = data_race(*orig);
 		INIT_LIST_HEAD(&new->anon_vma_chain);
 		new->vm_next = new->vm_prev = NULL;
 	}

From cb38f82043d1641a27f96b58b402ca4b7a88f52d Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Mon, 10 Feb 2020 09:10:16 -0500
Subject: [PATCH 187/502] x86/mm/pat: Mark an intentional data race

cpa_4k_install could be accessed concurrently as noticed by KCSAN,

read to 0xffffffffaa59a000 of 8 bytes by interrupt on cpu 7:
cpa_inc_4k_install arch/x86/mm/pat/set_memory.c:131 [inline]
__change_page_attr+0x10cf/0x1840 arch/x86/mm/pat/set_memory.c:1514
__change_page_attr_set_clr+0xce/0x490 arch/x86/mm/pat/set_memory.c:1636
__set_pages_np+0xc4/0xf0 arch/x86/mm/pat/set_memory.c:2148
__kernel_map_pages+0xb0/0xc8 arch/x86/mm/pat/set_memory.c:2178
kernel_map_pages include/linux/mm.h:2719 [inline] <snip>

write to 0xffffffffaa59a000 of 8 bytes by task 1 on cpu 6:
cpa_inc_4k_install arch/x86/mm/pat/set_memory.c:131 [inline]
__change_page_attr+0x10ea/0x1840 arch/x86/mm/pat/set_memory.c:1514
__change_page_attr_set_clr+0xce/0x490 arch/x86/mm/pat/set_memory.c:1636
__set_pages_p+0xc4/0xf0 arch/x86/mm/pat/set_memory.c:2129
__kernel_map_pages+0x2e/0xc8 arch/x86/mm/pat/set_memory.c:2176
kernel_map_pages include/linux/mm.h:2719 [inline] <snip>

Both accesses are due to the same "cpa_4k_install++" in
cpa_inc_4k_install. A data race here could be potentially undesirable:
depending on compiler optimizations or how x86 executes a non-LOCK'd
increment, it may lose increments, corrupt the counter, etc. Since this
counter only seems to be used for printing some stats, this data race
itself is unlikely to cause harm to the system though. Thus, mark this
intentional data race using the data_race() macro.

Suggested-by: Marco Elver <elver@google.com>
Signed-off-by: Qian Cai <cai@lca.pw>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 arch/x86/mm/pat/set_memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 77e04304a2a7..d1b2a889f035 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -135,7 +135,7 @@ static inline void cpa_inc_2m_checked(void)
 
 static inline void cpa_inc_4k_install(void)
 {
-	cpa_4k_install++;
+	data_race(cpa_4k_install++);
 }
 
 static inline void cpa_inc_lp_sameprot(int level)

From c93773c1a3fedf6c3f6fa12833e2b74a9897c3e3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 12 Feb 2020 13:29:15 -0800
Subject: [PATCH 188/502] rculist: Add ASSERT_EXCLUSIVE_ACCESS() to
 __list_splice_init_rcu()

After the sync() in __list_splice_init_rcu(), there should be no
readers traversing the old list.  This commit therefore enlists the
help of KCSAN to verify this condition via a pair of calls to
ASSERT_EXCLUSIVE_ACCESS().

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Cc: Marco Elver <elver@google.com>
---
 include/linux/rculist.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index df587d181844..2ebd112f86f7 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -248,6 +248,8 @@ static inline void __list_splice_init_rcu(struct list_head *list,
 	 */
 
 	sync();
+	ASSERT_EXCLUSIVE_ACCESS(*first);
+	ASSERT_EXCLUSIVE_ACCESS(*last);
 
 	/*
 	 * Readers are finished with the source list, so perform splice.

From 1fe84fd4a4027a17d511a832f89ab14107650ba4 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 5 May 2020 20:28:21 +0200
Subject: [PATCH 189/502] kcsan: Add test suite

This adds a KCSAN test suite focusing on the behaviour of the integrated
runtime. It tests various race scenarios and verifies the reports printed
to the console. It makes use of KUnit for test organization, and the
Torture framework for test thread control.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/Makefile     |    3 +
 kernel/kcsan/kcsan-test.c | 1084 +++++++++++++++++++++++++++++++++++++
 lib/Kconfig.kcsan         |   23 +-
 3 files changed, 1109 insertions(+), 1 deletion(-)
 create mode 100644 kernel/kcsan/kcsan-test.c

diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index d4999b38d1be..14533cf24bc3 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -12,3 +12,6 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
 
 obj-y := core.o debugfs.o report.o
 obj-$(CONFIG_KCSAN_SELFTEST) += test.o
+
+CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
+obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o
diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c
new file mode 100644
index 000000000000..a8c11506dd2a
--- /dev/null
+++ b/kernel/kcsan/kcsan-test.c
@@ -0,0 +1,1084 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KCSAN test with various race scenarios to test runtime behaviour. Since the
+ * interface with which KCSAN's reports are obtained is via the console, this is
+ * the output we should verify. Each test case checks for the presence (or
+ * absence) of generated reports. Relies on 'console' tracepoint to capture
+ * reports as they appear in the kernel log.
+ *
+ * Makes use of KUnit for test organization, and the Torture framework for test
+ * thread control.
+ *
+ * Copyright (C) 2020, Google LLC.
+ * Author: Marco Elver <elver@google.com>
+ */
+
+#include <kunit/test.h>
+#include <linux/jiffies.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/timer.h>
+#include <linux/torture.h>
+#include <linux/tracepoint.h>
+#include <linux/types.h>
+#include <trace/events/printk.h>
+
+/* Points to current test-case memory access "kernels". */
+static void (*access_kernels[2])(void);
+
+static struct task_struct **threads; /* Lists of threads. */
+static unsigned long end_time;       /* End time of test. */
+
+/* Report as observed from console. */
+static struct {
+	spinlock_t lock;
+	int nlines;
+	char lines[3][512];
+} observed = {
+	.lock = __SPIN_LOCK_UNLOCKED(observed.lock),
+};
+
+/* Setup test checking loop. */
+static __no_kcsan_or_inline void
+begin_test_checks(void (*func1)(void), void (*func2)(void))
+{
+	kcsan_disable_current();
+
+	/*
+	 * Require at least as long as KCSAN_REPORT_ONCE_IN_MS, to ensure at
+	 * least one race is reported.
+	 */
+	end_time = jiffies + msecs_to_jiffies(CONFIG_KCSAN_REPORT_ONCE_IN_MS + 500);
+
+	/* Signal start; release potential initialization of shared data. */
+	smp_store_release(&access_kernels[0], func1);
+	smp_store_release(&access_kernels[1], func2);
+}
+
+/* End test checking loop. */
+static __no_kcsan_or_inline bool
+end_test_checks(bool stop)
+{
+	if (!stop && time_before(jiffies, end_time)) {
+		/* Continue checking */
+		might_sleep();
+		return false;
+	}
+
+	kcsan_enable_current();
+	return true;
+}
+
+/*
+ * Probe for console output: checks if a race was reported, and obtains observed
+ * lines of interest.
+ */
+__no_kcsan
+static void probe_console(void *ignore, const char *buf, size_t len)
+{
+	unsigned long flags;
+	int nlines;
+
+	/*
+	 * Note that KCSAN reports under a global lock, so we do not risk the
+	 * possibility of having multiple reports interleaved. If that were the
+	 * case, we'd expect tests to fail.
+	 */
+
+	spin_lock_irqsave(&observed.lock, flags);
+	nlines = observed.nlines;
+
+	if (strnstr(buf, "BUG: KCSAN: ", len) && strnstr(buf, "test_", len)) {
+		/*
+		 * KCSAN report and related to the test.
+		 *
+		 * The provided @buf is not NUL-terminated; copy no more than
+		 * @len bytes and let strscpy() add the missing NUL-terminator.
+		 */
+		strscpy(observed.lines[0], buf, min(len + 1, sizeof(observed.lines[0])));
+		nlines = 1;
+	} else if ((nlines == 1 || nlines == 2) && strnstr(buf, "bytes by", len)) {
+		strscpy(observed.lines[nlines++], buf, min(len + 1, sizeof(observed.lines[0])));
+
+		if (strnstr(buf, "race at unknown origin", len)) {
+			if (WARN_ON(nlines != 2))
+				goto out;
+
+			/* No second line of interest. */
+			strcpy(observed.lines[nlines++], "<none>");
+		}
+	}
+
+out:
+	WRITE_ONCE(observed.nlines, nlines); /* Publish new nlines. */
+	spin_unlock_irqrestore(&observed.lock, flags);
+}
+
+/* Check if a report related to the test exists. */
+__no_kcsan
+static bool report_available(void)
+{
+	return READ_ONCE(observed.nlines) == ARRAY_SIZE(observed.lines);
+}
+
+/* Report information we expect in a report. */
+struct expect_report {
+	/* Access information of both accesses. */
+	struct {
+		void *fn;    /* Function pointer to expected function of top frame. */
+		void *addr;  /* Address of access; unchecked if NULL. */
+		size_t size; /* Size of access; unchecked if @addr is NULL. */
+		int type;    /* Access type, see KCSAN_ACCESS definitions. */
+	} access[2];
+};
+
+/* Check observed report matches information in @r. */
+__no_kcsan
+static bool report_matches(const struct expect_report *r)
+{
+	const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
+	bool ret = false;
+	unsigned long flags;
+	typeof(observed.lines) expect;
+	const char *end;
+	char *cur;
+	int i;
+
+	/* Double-checked locking. */
+	if (!report_available())
+		return false;
+
+	/* Generate expected report contents. */
+
+	/* Title */
+	cur = expect[0];
+	end = &expect[0][sizeof(expect[0]) - 1];
+	cur += scnprintf(cur, end - cur, "BUG: KCSAN: %s in ",
+			 is_assert ? "assert: race" : "data-race");
+	if (r->access[1].fn) {
+		char tmp[2][64];
+		int cmp;
+
+		/* Expect lexicographically sorted function names in title. */
+		scnprintf(tmp[0], sizeof(tmp[0]), "%pS", r->access[0].fn);
+		scnprintf(tmp[1], sizeof(tmp[1]), "%pS", r->access[1].fn);
+		cmp = strcmp(tmp[0], tmp[1]);
+		cur += scnprintf(cur, end - cur, "%ps / %ps",
+				 cmp < 0 ? r->access[0].fn : r->access[1].fn,
+				 cmp < 0 ? r->access[1].fn : r->access[0].fn);
+	} else {
+		scnprintf(cur, end - cur, "%pS", r->access[0].fn);
+		/* The exact offset won't match, remove it. */
+		cur = strchr(expect[0], '+');
+		if (cur)
+			*cur = '\0';
+	}
+
+	/* Access 1 */
+	cur = expect[1];
+	end = &expect[1][sizeof(expect[1]) - 1];
+	if (!r->access[1].fn)
+		cur += scnprintf(cur, end - cur, "race at unknown origin, with ");
+
+	/* Access 1 & 2 */
+	for (i = 0; i < 2; ++i) {
+		const char *const access_type =
+			(r->access[i].type & KCSAN_ACCESS_ASSERT) ?
+				((r->access[i].type & KCSAN_ACCESS_WRITE) ?
+					 "assert no accesses" :
+					 "assert no writes") :
+				((r->access[i].type & KCSAN_ACCESS_WRITE) ?
+					 "write" :
+					 "read");
+		const char *const access_type_aux =
+			(r->access[i].type & KCSAN_ACCESS_ATOMIC) ?
+				" (marked)" :
+				((r->access[i].type & KCSAN_ACCESS_SCOPED) ?
+					 " (scoped)" :
+					 "");
+
+		if (i == 1) {
+			/* Access 2 */
+			cur = expect[2];
+			end = &expect[2][sizeof(expect[2]) - 1];
+
+			if (!r->access[1].fn) {
+				/* Dummy string if no second access is available. */
+				strcpy(cur, "<none>");
+				break;
+			}
+		}
+
+		cur += scnprintf(cur, end - cur, "%s%s to ", access_type,
+				 access_type_aux);
+
+		if (r->access[i].addr) /* Address is optional. */
+			cur += scnprintf(cur, end - cur, "0x%px of %zu bytes",
+					 r->access[i].addr, r->access[i].size);
+	}
+
+	spin_lock_irqsave(&observed.lock, flags);
+	if (!report_available())
+		goto out; /* A new report is being captured. */
+
+	/* Finally match expected output to what we actually observed. */
+	ret = strstr(observed.lines[0], expect[0]) &&
+	      /* Access info may appear in any order. */
+	      ((strstr(observed.lines[1], expect[1]) &&
+		strstr(observed.lines[2], expect[2])) ||
+	       (strstr(observed.lines[1], expect[2]) &&
+		strstr(observed.lines[2], expect[1])));
+out:
+	spin_unlock_irqrestore(&observed.lock, flags);
+	return ret;
+}
+
+/* ===== Test kernels ===== */
+
+static long test_sink;
+static long test_var;
+/* @test_array should be large enough to fall into multiple watchpoint slots. */
+static long test_array[3 * PAGE_SIZE / sizeof(long)];
+static struct {
+	long val[8];
+} test_struct;
+static DEFINE_SEQLOCK(test_seqlock);
+
+/*
+ * Helper to avoid compiler optimizing out reads, and to generate source values
+ * for writes.
+ */
+__no_kcsan
+static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); }
+
+static noinline void test_kernel_read(void) { sink_value(test_var); }
+
+static noinline void test_kernel_write(void)
+{
+	test_var = READ_ONCE_NOCHECK(test_sink) + 1;
+}
+
+static noinline void test_kernel_write_nochange(void) { test_var = 42; }
+
+/* Suffixed by value-change exception filter. */
+static noinline void test_kernel_write_nochange_rcu(void) { test_var = 42; }
+
+static noinline void test_kernel_read_atomic(void)
+{
+	sink_value(READ_ONCE(test_var));
+}
+
+static noinline void test_kernel_write_atomic(void)
+{
+	WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1);
+}
+
+__no_kcsan
+static noinline void test_kernel_write_uninstrumented(void) { test_var++; }
+
+static noinline void test_kernel_data_race(void) { data_race(test_var++); }
+
+static noinline void test_kernel_assert_writer(void)
+{
+	ASSERT_EXCLUSIVE_WRITER(test_var);
+}
+
+static noinline void test_kernel_assert_access(void)
+{
+	ASSERT_EXCLUSIVE_ACCESS(test_var);
+}
+
+#define TEST_CHANGE_BITS 0xff00ff00
+
+static noinline void test_kernel_change_bits(void)
+{
+	if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) {
+		/*
+		 * Avoid race of unknown origin for this test, just pretend they
+		 * are atomic.
+		 */
+		kcsan_nestable_atomic_begin();
+		test_var ^= TEST_CHANGE_BITS;
+		kcsan_nestable_atomic_end();
+	} else
+		WRITE_ONCE(test_var, READ_ONCE(test_var) ^ TEST_CHANGE_BITS);
+}
+
+static noinline void test_kernel_assert_bits_change(void)
+{
+	ASSERT_EXCLUSIVE_BITS(test_var, TEST_CHANGE_BITS);
+}
+
+static noinline void test_kernel_assert_bits_nochange(void)
+{
+	ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS);
+}
+
+/* To check that scoped assertions do trigger anywhere in scope. */
+static noinline void test_enter_scope(void)
+{
+	int x = 0;
+
+	/* Unrelated accesses to scoped assert. */
+	READ_ONCE(test_sink);
+	kcsan_check_read(&x, sizeof(x));
+}
+
+static noinline void test_kernel_assert_writer_scoped(void)
+{
+	ASSERT_EXCLUSIVE_WRITER_SCOPED(test_var);
+	test_enter_scope();
+}
+
+static noinline void test_kernel_assert_access_scoped(void)
+{
+	ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_var);
+	test_enter_scope();
+}
+
+static noinline void test_kernel_rmw_array(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(test_array); ++i)
+		test_array[i]++;
+}
+
+static noinline void test_kernel_write_struct(void)
+{
+	kcsan_check_write(&test_struct, sizeof(test_struct));
+	kcsan_disable_current();
+	test_struct.val[3]++; /* induce value change */
+	kcsan_enable_current();
+}
+
+static noinline void test_kernel_write_struct_part(void)
+{
+	test_struct.val[3] = 42;
+}
+
+static noinline void test_kernel_read_struct_zero_size(void)
+{
+	kcsan_check_read(&test_struct.val[3], 0);
+}
+
+static noinline void test_kernel_seqlock_reader(void)
+{
+	unsigned int seq;
+
+	do {
+		seq = read_seqbegin(&test_seqlock);
+		sink_value(test_var);
+	} while (read_seqretry(&test_seqlock, seq));
+}
+
+static noinline void test_kernel_seqlock_writer(void)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&test_seqlock, flags);
+	test_var++;
+	write_sequnlock_irqrestore(&test_seqlock, flags);
+}
+
+/* ===== Test cases ===== */
+
+/* Simple test with normal data race. */
+__no_kcsan
+static void test_basic(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	static const struct expect_report never = {
+		.access = {
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	bool match_expect = false;
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_write, test_kernel_read);
+	do {
+		match_expect |= report_matches(&expect);
+		match_never = report_matches(&never);
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
+ * Stress KCSAN with lots of concurrent races on different addresses until
+ * timeout.
+ */
+__no_kcsan
+static void test_concurrent_races(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			/* NULL will match any address. */
+			{ test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE },
+			{ test_kernel_rmw_array, NULL, 0, 0 },
+		},
+	};
+	static const struct expect_report never = {
+		.access = {
+			{ test_kernel_rmw_array, NULL, 0, 0 },
+			{ test_kernel_rmw_array, NULL, 0, 0 },
+		},
+	};
+	bool match_expect = false;
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_rmw_array, test_kernel_rmw_array);
+	do {
+		match_expect |= report_matches(&expect);
+		match_never |= report_matches(&never);
+	} while (!end_test_checks(false));
+	KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check matches exist. */
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the KCSAN_REPORT_VALUE_CHANGE_ONLY option. */
+__no_kcsan
+static void test_novalue_change(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write_nochange, test_kernel_read);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY))
+		KUNIT_EXPECT_FALSE(test, match_expect);
+	else
+		KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that the rules where the KCSAN_REPORT_VALUE_CHANGE_ONLY option should
+ * never apply work.
+ */
+__no_kcsan
+static void test_novalue_change_exception(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that data races of unknown origin are reported. */
+__no_kcsan
+static void test_unknown_origin(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+			{ NULL },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write_uninstrumented, test_kernel_read);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN))
+		KUNIT_EXPECT_TRUE(test, match_expect);
+	else
+		KUNIT_EXPECT_FALSE(test, match_expect);
+}
+
+/* Test KCSAN_ASSUME_PLAIN_WRITES_ATOMIC if it is selected. */
+__no_kcsan
+static void test_write_write_assume_atomic(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+			{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write, test_kernel_write);
+	do {
+		sink_value(READ_ONCE(test_var)); /* induce value-change */
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC))
+		KUNIT_EXPECT_FALSE(test, match_expect);
+	else
+		KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that data races with writes larger than word-size are always reported,
+ * even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected.
+ */
+__no_kcsan
+static void test_write_write_struct(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write_struct, test_kernel_write_struct);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/*
+ * Test that data races where only one write is larger than word-size are always
+ * reported, even if KCSAN_ASSUME_PLAIN_WRITES_ATOMIC is selected.
+ */
+__no_kcsan
+static void test_write_write_struct_part(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+			{ test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_write_struct, test_kernel_write_struct_part);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Test that races with atomic accesses never result in reports. */
+__no_kcsan
+static void test_read_atomic_write_atomic(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_read_atomic, test_kernel_write_atomic);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test that a race with an atomic and plain access results in reports. */
+__no_kcsan
+static void test_read_plain_atomic_write(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+			{ test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
+		},
+	};
+	bool match_expect = false;
+
+	if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
+		return;
+
+	begin_test_checks(test_kernel_read, test_kernel_write_atomic);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+/* Zero-sized accesses should never cause data race reports. */
+__no_kcsan
+static void test_zero_size_access(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+		},
+	};
+	const struct expect_report never = {
+		.access = {
+			{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
+			{ test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 },
+		},
+	};
+	bool match_expect = false;
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_write_struct, test_kernel_read_struct_zero_size);
+	do {
+		match_expect |= report_matches(&expect);
+		match_never = report_matches(&never);
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_TRUE(test, match_expect); /* Sanity check. */
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/* Test the data_race() macro. */
+__no_kcsan
+static void test_data_race(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_data_race, test_kernel_data_race);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_writer(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+			{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_assert_writer, test_kernel_write_nochange);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_assert_access, test_kernel_read);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access_writer(struct kunit *test)
+{
+	const struct expect_report expect_access_writer = {
+		.access = {
+			{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+			{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+		},
+	};
+	const struct expect_report expect_access_access = {
+		.access = {
+			{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+			{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
+		},
+	};
+	const struct expect_report never = {
+		.access = {
+			{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+			{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+		},
+	};
+	bool match_expect_access_writer = false;
+	bool match_expect_access_access = false;
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_assert_access, test_kernel_assert_writer);
+	do {
+		match_expect_access_writer |= report_matches(&expect_access_writer);
+		match_expect_access_access |= report_matches(&expect_access_access);
+		match_never |= report_matches(&never);
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_TRUE(test, match_expect_access_writer);
+	KUNIT_EXPECT_TRUE(test, match_expect_access_access);
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_bits_change(struct kunit *test)
+{
+	const struct expect_report expect = {
+		.access = {
+			{ test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
+			{ test_kernel_change_bits, &test_var, sizeof(test_var),
+				KCSAN_ACCESS_WRITE | (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) ? 0 : KCSAN_ACCESS_ATOMIC) },
+		},
+	};
+	bool match_expect = false;
+
+	begin_test_checks(test_kernel_assert_bits_change, test_kernel_change_bits);
+	do {
+		match_expect = report_matches(&expect);
+	} while (!end_test_checks(match_expect));
+	KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
+__no_kcsan
+static void test_assert_exclusive_bits_nochange(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_assert_bits_nochange, test_kernel_change_bits);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+__no_kcsan
+static void test_assert_exclusive_writer_scoped(struct kunit *test)
+{
+	const struct expect_report expect_start = {
+		.access = {
+			{ test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
+			{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+		},
+	};
+	const struct expect_report expect_anywhere = {
+		.access = {
+			{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
+			{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+		},
+	};
+	bool match_expect_start = false;
+	bool match_expect_anywhere = false;
+
+	begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange);
+	do {
+		match_expect_start |= report_matches(&expect_start);
+		match_expect_anywhere |= report_matches(&expect_anywhere);
+	} while (!end_test_checks(match_expect_start && match_expect_anywhere));
+	KUNIT_EXPECT_TRUE(test, match_expect_start);
+	KUNIT_EXPECT_TRUE(test, match_expect_anywhere);
+}
+
+__no_kcsan
+static void test_assert_exclusive_access_scoped(struct kunit *test)
+{
+	const struct expect_report expect_start1 = {
+		.access = {
+			{ test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	const struct expect_report expect_start2 = {
+		.access = { expect_start1.access[0], expect_start1.access[0] },
+	};
+	const struct expect_report expect_inscope = {
+		.access = {
+			{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
+			{ test_kernel_read, &test_var, sizeof(test_var), 0 },
+		},
+	};
+	bool match_expect_start = false;
+	bool match_expect_inscope = false;
+
+	begin_test_checks(test_kernel_assert_access_scoped, test_kernel_read);
+	end_time += msecs_to_jiffies(1000); /* This test requires a bit more time. */
+	do {
+		match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2);
+		match_expect_inscope |= report_matches(&expect_inscope);
+	} while (!end_test_checks(match_expect_start && match_expect_inscope));
+	KUNIT_EXPECT_TRUE(test, match_expect_start);
+	KUNIT_EXPECT_TRUE(test, match_expect_inscope);
+}
+
+/* Test that racing accesses in seqlock critical sections are not reported. */
+__no_kcsan
+static void test_seqlock_noreport(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_seqlock_reader, test_kernel_seqlock_writer);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
+ * Each test case is run with different numbers of threads. Until KUnit supports
+ * passing arguments for each test case, we encode #threads in the test case
+ * name (read by get_num_threads()). [The '-' was chosen as a stylistic
+ * preference to separate test name and #threads.]
+ *
+ * The thread counts are chosen to cover potentially interesting boundaries and
+ * corner cases (range 2-5), and then stress the system with larger counts.
+ */
+#define KCSAN_KUNIT_CASE(test_name)                                            \
+	{ .run_case = test_name, .name = #test_name "-02" },                   \
+	{ .run_case = test_name, .name = #test_name "-03" },                   \
+	{ .run_case = test_name, .name = #test_name "-04" },                   \
+	{ .run_case = test_name, .name = #test_name "-05" },                   \
+	{ .run_case = test_name, .name = #test_name "-08" },                   \
+	{ .run_case = test_name, .name = #test_name "-16" }
+
+static struct kunit_case kcsan_test_cases[] = {
+	KCSAN_KUNIT_CASE(test_basic),
+	KCSAN_KUNIT_CASE(test_concurrent_races),
+	KCSAN_KUNIT_CASE(test_novalue_change),
+	KCSAN_KUNIT_CASE(test_novalue_change_exception),
+	KCSAN_KUNIT_CASE(test_unknown_origin),
+	KCSAN_KUNIT_CASE(test_write_write_assume_atomic),
+	KCSAN_KUNIT_CASE(test_write_write_struct),
+	KCSAN_KUNIT_CASE(test_write_write_struct_part),
+	KCSAN_KUNIT_CASE(test_read_atomic_write_atomic),
+	KCSAN_KUNIT_CASE(test_read_plain_atomic_write),
+	KCSAN_KUNIT_CASE(test_zero_size_access),
+	KCSAN_KUNIT_CASE(test_data_race),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_access),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_access_writer),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_bits_change),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped),
+	KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped),
+	KCSAN_KUNIT_CASE(test_seqlock_noreport),
+	{},
+};
+
+/* ===== End test cases ===== */
+
+/* Get number of threads encoded in test name. */
+static bool __no_kcsan
+get_num_threads(const char *test, int *nthreads)
+{
+	int len = strlen(test);
+
+	if (WARN_ON(len < 3))
+		return false;
+
+	*nthreads = test[len - 1] - '0';
+	*nthreads += (test[len - 2] - '0') * 10;
+
+	if (WARN_ON(*nthreads < 0))
+		return false;
+
+	return true;
+}
+
+/* Concurrent accesses from interrupts. */
+__no_kcsan
+static void access_thread_timer(struct timer_list *timer)
+{
+	static atomic_t cnt = ATOMIC_INIT(0);
+	unsigned int idx;
+	void (*func)(void);
+
+	idx = (unsigned int)atomic_inc_return(&cnt) % ARRAY_SIZE(access_kernels);
+	/* Acquire potential initialization. */
+	func = smp_load_acquire(&access_kernels[idx]);
+	if (func)
+		func();
+}
+
+/* The main loop for each thread. */
+__no_kcsan
+static int access_thread(void *arg)
+{
+	struct timer_list timer;
+	unsigned int cnt = 0;
+	unsigned int idx;
+	void (*func)(void);
+
+	timer_setup_on_stack(&timer, access_thread_timer, 0);
+	do {
+		might_sleep();
+
+		if (!timer_pending(&timer))
+			mod_timer(&timer, jiffies + 1);
+		else {
+			/* Iterate through all kernels. */
+			idx = cnt++ % ARRAY_SIZE(access_kernels);
+			/* Acquire potential initialization. */
+			func = smp_load_acquire(&access_kernels[idx]);
+			if (func)
+				func();
+		}
+	} while (!torture_must_stop());
+	del_timer_sync(&timer);
+	destroy_timer_on_stack(&timer);
+
+	torture_kthread_stopping("access_thread");
+	return 0;
+}
+
+__no_kcsan
+static int test_init(struct kunit *test)
+{
+	unsigned long flags;
+	int nthreads;
+	int i;
+
+	spin_lock_irqsave(&observed.lock, flags);
+	for (i = 0; i < ARRAY_SIZE(observed.lines); ++i)
+		observed.lines[i][0] = '\0';
+	observed.nlines = 0;
+	spin_unlock_irqrestore(&observed.lock, flags);
+
+	if (!torture_init_begin((char *)test->name, 1))
+		return -EBUSY;
+
+	if (!get_num_threads(test->name, &nthreads))
+		goto err;
+
+	if (WARN_ON(threads))
+		goto err;
+
+	for (i = 0; i < ARRAY_SIZE(access_kernels); ++i) {
+		if (WARN_ON(access_kernels[i]))
+			goto err;
+	}
+
+	if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
+		/*
+		 * Without any preemption, keep 2 CPUs free for other tasks, one
+		 * of which is the main test case function checking for
+		 * completion or failure.
+		 */
+		const int min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0;
+		const int min_required_cpus = 2 + min_unused_cpus;
+
+		if (num_online_cpus() < min_required_cpus) {
+			pr_err("%s: too few online CPUs (%u < %d) for test\n",
+			       test->name, num_online_cpus(), min_required_cpus);
+			goto err;
+		} else if (nthreads > num_online_cpus() - min_unused_cpus) {
+			nthreads = num_online_cpus() - min_unused_cpus;
+			pr_warn("%s: limiting number of threads to %d\n",
+				test->name, nthreads);
+		}
+	}
+
+	if (nthreads) {
+		threads = kcalloc(nthreads + 1, sizeof(struct task_struct *),
+				  GFP_KERNEL);
+		if (WARN_ON(!threads))
+			goto err;
+
+		threads[nthreads] = NULL;
+		for (i = 0; i < nthreads; ++i) {
+			if (torture_create_kthread(access_thread, NULL,
+						   threads[i]))
+				goto err;
+		}
+	}
+
+	torture_init_end();
+
+	return 0;
+
+err:
+	kfree(threads);
+	threads = NULL;
+	torture_init_end();
+	return -EINVAL;
+}
+
+__no_kcsan
+static void test_exit(struct kunit *test)
+{
+	struct task_struct **stop_thread;
+	int i;
+
+	if (torture_cleanup_begin())
+		return;
+
+	for (i = 0; i < ARRAY_SIZE(access_kernels); ++i)
+		WRITE_ONCE(access_kernels[i], NULL);
+	/* @threads is NULL-terminated by test_init(); stop each access_thread. */
+	if (threads) {
+		for (stop_thread = threads; *stop_thread; stop_thread++)
+			torture_stop_kthread(access_thread, *stop_thread);
+
+		kfree(threads);
+		threads = NULL;
+	}
+
+	torture_cleanup_end();
+}
+
+static struct kunit_suite kcsan_test_suite = {
+	.name = "kcsan-test",
+	.test_cases = kcsan_test_cases,
+	.init = test_init,
+	.exit = test_exit,
+};
+static struct kunit_suite *kcsan_test_suites[] = { &kcsan_test_suite, NULL };
+
+__no_kcsan
+static void register_tracepoints(struct tracepoint *tp, void *ignore)
+{
+	check_trace_callback_type_console(probe_console);
+	if (!strcmp(tp->name, "console"))
+		WARN_ON(tracepoint_probe_register(tp, probe_console, NULL));
+}
+
+__no_kcsan
+static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
+{
+	if (!strcmp(tp->name, "console"))
+		tracepoint_probe_unregister(tp, probe_console, NULL);
+}
+
+/*
+ * We only want to do tracepoints setup and teardown once, therefore we have to
+ * customize the init and exit functions and cannot rely on kunit_test_suite().
+ */
+static int __init kcsan_test_init(void)
+{
+	/*
+	 * Because we want to be able to build the test as a module, we need to
+	 * iterate through all known tracepoints, since the static registration
+	 * won't work here.
+	 */
+	for_each_kernel_tracepoint(register_tracepoints, NULL);
+	return __kunit_test_suites_init(kcsan_test_suites);
+}
+
+static void kcsan_test_exit(void)
+{
+	__kunit_test_suites_exit(kcsan_test_suites);
+	for_each_kernel_tracepoint(unregister_tracepoints, NULL);
+	tracepoint_synchronize_unregister();
+}
+
+late_initcall(kcsan_test_init);
+module_exit(kcsan_test_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan
index 5ee88e5119c2..3f3b5bca7a8f 100644
--- a/lib/Kconfig.kcsan
+++ b/lib/Kconfig.kcsan
@@ -59,7 +59,28 @@ config KCSAN_SELFTEST
 	bool "Perform short selftests on boot"
 	default y
 	help
-	  Run KCSAN selftests on boot. On test failure, causes the kernel to panic.
+	  Run KCSAN selftests on boot. On test failure, causes the kernel to
+	  panic. Recommended to be enabled, ensuring critical functionality
+	  works as intended.
+
+config KCSAN_TEST
+	tristate "KCSAN test for integrated runtime behaviour"
+	depends on TRACEPOINTS && KUNIT
+	select TORTURE_TEST
+	help
+	  KCSAN test focusing on behaviour of the integrated runtime. Tests
+	  various race scenarios, and verifies the reports generated to
+	  console. Makes use of KUnit for test organization, and the Torture
+	  framework for test thread control.
+
+	  Each test case may run at least up to KCSAN_REPORT_ONCE_IN_MS
+	  milliseconds. Test run duration may be optimized by building the
+	  kernel and KCSAN test with KCSAN_REPORT_ONCE_IN_MS set to a lower
+	  than default value.
+
+	  Say Y here if you want the test to be built into the kernel and run
+	  during boot; say M if you want the test to build as a module; say N
+	  if you are unsure.
 
 config KCSAN_EARLY_ENABLE
 	bool "Early enable during boot"

From 33190b675ce2eacbeb4e75168c05b41110b506ec Mon Sep 17 00:00:00 2001
From: Qian Cai <cai@lca.pw>
Date: Tue, 11 Feb 2020 08:54:15 -0500
Subject: [PATCH 190/502] locking/osq_lock: Annotate a data race in osq_lock

The prev->next pointer can be accessed concurrently as noticed by KCSAN:

 write (marked) to 0xffff9d3370dbbe40 of 8 bytes by task 3294 on cpu 107:
  osq_lock+0x25f/0x350
  osq_wait_next at kernel/locking/osq_lock.c:79
  (inlined by) osq_lock at kernel/locking/osq_lock.c:185
  rwsem_optimistic_spin
  <snip>

 read to 0xffff9d3370dbbe40 of 8 bytes by task 3398 on cpu 100:
  osq_lock+0x196/0x350
  osq_lock at kernel/locking/osq_lock.c:157
  rwsem_optimistic_spin
  <snip>

The write only stores NULL to prev->next, and the read tests whether
prev->next equals this_cpu_ptr(&osq_node). Even if the value is
shattered, the code still works correctly. Thus, mark it as an
intentional data race using the data_race() macro.

Signed-off-by: Qian Cai <cai@lca.pw>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/locking/osq_lock.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 1f7734949ac8..1de006ed3aa8 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -154,7 +154,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 */
 
 	for (;;) {
-		if (prev->next == node &&
+		/*
+		 * cpu_relax() below implies a compiler barrier which would
+		 * prevent this comparison being optimized away.
+		 */
+		if (data_race(prev->next) == node &&
 		    cmpxchg(&prev->next, node, NULL) == node)
 			break;
 

From 2888557f68db334a3839dcc262264a4c436f576b Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 2 Jun 2020 16:36:33 +0200
Subject: [PATCH 191/502] kcsan: Prefer '__no_kcsan inline' in test

Instead of __no_kcsan_or_inline, prefer '__no_kcsan inline' in test --
this is in case we decide to remove __no_kcsan_or_inline.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/kcsan-test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c
index a8c11506dd2a..3af420ad6ee7 100644
--- a/kernel/kcsan/kcsan-test.c
+++ b/kernel/kcsan/kcsan-test.c
@@ -43,7 +43,7 @@ static struct {
 };
 
 /* Setup test checking loop. */
-static __no_kcsan_or_inline void
+static __no_kcsan inline void
 begin_test_checks(void (*func1)(void), void (*func2)(void))
 {
 	kcsan_disable_current();
@@ -60,7 +60,7 @@ begin_test_checks(void (*func1)(void), void (*func2)(void))
 }
 
 /* End test checking loop. */
-static __no_kcsan_or_inline bool
+static __no_kcsan inline bool
 end_test_checks(bool stop)
 {
 	if (!stop && time_before(jiffies, end_time)) {

From 9dd979bae4cf76558ff816abe83283308fb1ae8c Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 16 Jun 2020 14:36:22 +0200
Subject: [PATCH 192/502] kcsan: Silence -Wmissing-prototypes warning with W=1

The functions here should not be forward declared for explicit use
elsewhere in the kernel, as they should only be emitted by the compiler
due to sanitizer instrumentation.  Add forward declarations a line above
their definition to shut up warnings in W=1 builds.

Link: https://lkml.kernel.org/r/202006060103.jSCpnV1g%lkp@intel.com
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 15f67949d11e..1866bafda4fd 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -754,6 +754,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
  */
 
 #define DEFINE_TSAN_READ_WRITE(size)                                           \
+	void __tsan_read##size(void *ptr);                                     \
 	void __tsan_read##size(void *ptr)                                      \
 	{                                                                      \
 		check_access(ptr, size, 0);                                    \
@@ -762,6 +763,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
 	void __tsan_unaligned_read##size(void *ptr)                            \
 		__alias(__tsan_read##size);                                    \
 	EXPORT_SYMBOL(__tsan_unaligned_read##size);                            \
+	void __tsan_write##size(void *ptr);                                    \
 	void __tsan_write##size(void *ptr)                                     \
 	{                                                                      \
 		check_access(ptr, size, KCSAN_ACCESS_WRITE);                   \
@@ -777,12 +779,14 @@ DEFINE_TSAN_READ_WRITE(4);
 DEFINE_TSAN_READ_WRITE(8);
 DEFINE_TSAN_READ_WRITE(16);
 
+void __tsan_read_range(void *ptr, size_t size);
 void __tsan_read_range(void *ptr, size_t size)
 {
 	check_access(ptr, size, 0);
 }
 EXPORT_SYMBOL(__tsan_read_range);
 
+void __tsan_write_range(void *ptr, size_t size);
 void __tsan_write_range(void *ptr, size_t size)
 {
 	check_access(ptr, size, KCSAN_ACCESS_WRITE);
@@ -799,6 +803,7 @@ EXPORT_SYMBOL(__tsan_write_range);
  * the size-check of compiletime_assert_rwonce_type().
  */
 #define DEFINE_TSAN_VOLATILE_READ_WRITE(size)                                  \
+	void __tsan_volatile_read##size(void *ptr);                            \
 	void __tsan_volatile_read##size(void *ptr)                             \
 	{                                                                      \
 		const bool is_atomic = size <= sizeof(long long) &&            \
@@ -811,6 +816,7 @@ EXPORT_SYMBOL(__tsan_write_range);
 	void __tsan_unaligned_volatile_read##size(void *ptr)                   \
 		__alias(__tsan_volatile_read##size);                           \
 	EXPORT_SYMBOL(__tsan_unaligned_volatile_read##size);                   \
+	void __tsan_volatile_write##size(void *ptr);                           \
 	void __tsan_volatile_write##size(void *ptr)                            \
 	{                                                                      \
 		const bool is_atomic = size <= sizeof(long long) &&            \
@@ -836,14 +842,17 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(16);
  * The below are not required by KCSAN, but can still be emitted by the
  * compiler.
  */
+void __tsan_func_entry(void *call_pc);
 void __tsan_func_entry(void *call_pc)
 {
 }
 EXPORT_SYMBOL(__tsan_func_entry);
+void __tsan_func_exit(void);
 void __tsan_func_exit(void)
 {
 }
 EXPORT_SYMBOL(__tsan_func_exit);
+void __tsan_init(void);
 void __tsan_init(void)
 {
 }

From acfa087ccf2d2eff46186477f53e4c3ffbdb033d Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 16 Jun 2020 14:36:23 +0200
Subject: [PATCH 193/502] kcsan: Rename test.c to selftest.c

Rename 'test.c' to 'selftest.c' to better reflect its purpose (Kconfig
variable and code inside already match this). This is to avoid confusion
with the test suite module in 'kcsan-test.c'.

No functional change.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/Makefile               | 2 +-
 kernel/kcsan/{test.c => selftest.c} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename kernel/kcsan/{test.c => selftest.c} (100%)

diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 14533cf24bc3..092ce58d2e56 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -11,7 +11,7 @@ CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
 	$(call cc-option,-fno-stack-protector,)
 
 obj-y := core.o debugfs.o report.o
-obj-$(CONFIG_KCSAN_SELFTEST) += test.o
+obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
 
 CFLAGS_kcsan-test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
 obj-$(CONFIG_KCSAN_TEST) += kcsan-test.o
diff --git a/kernel/kcsan/test.c b/kernel/kcsan/selftest.c
similarity index 100%
rename from kernel/kcsan/test.c
rename to kernel/kcsan/selftest.c

From 7e766560e6e2c1cf2782f00e63c31564e4c9f0fe Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 16 Jun 2020 14:36:24 +0200
Subject: [PATCH 194/502] kcsan: Remove existing special atomic rules

Remove existing special atomic rules from kcsan_is_atomic_special()
because they are no longer needed. Since we rely on the compiler
emitting instrumentation distinguishing volatile accesses, the rules
have become redundant.

Let's keep kcsan_is_atomic_special() around, so that we have an obvious
place to add special rules should the need arise in future.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/atomic.h | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/kcsan/atomic.h b/kernel/kcsan/atomic.h
index be9e625227f3..75fe701f4127 100644
--- a/kernel/kcsan/atomic.h
+++ b/kernel/kcsan/atomic.h
@@ -3,8 +3,7 @@
 #ifndef _KERNEL_KCSAN_ATOMIC_H
 #define _KERNEL_KCSAN_ATOMIC_H
 
-#include <linux/jiffies.h>
-#include <linux/sched.h>
+#include <linux/types.h>
 
 /*
  * Special rules for certain memory where concurrent conflicting accesses are
@@ -13,8 +12,7 @@
  */
 static bool kcsan_is_atomic_special(const volatile void *ptr)
 {
-	/* volatile globals that have been observed in data races. */
-	return ptr == &jiffies || ptr == &current->state;
+	return false;
 }
 
 #endif /* _KERNEL_KCSAN_ATOMIC_H */

From 56b031f0abf55254d47a329010574733fa9a27b8 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Tue, 16 Jun 2020 14:36:25 +0200
Subject: [PATCH 195/502] kcsan: Add jiffies test to test suite

Add a test that neither KCSAN nor the compiler gets confused about
accesses to jiffies on different architectures.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/kcsan-test.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c
index 3af420ad6ee7..fed6fcb5768c 100644
--- a/kernel/kcsan/kcsan-test.c
+++ b/kernel/kcsan/kcsan-test.c
@@ -366,6 +366,11 @@ static noinline void test_kernel_read_struct_zero_size(void)
 	kcsan_check_read(&test_struct.val[3], 0);
 }
 
+static noinline void test_kernel_jiffies_reader(void)
+{
+	sink_value((long)jiffies);
+}
+
 static noinline void test_kernel_seqlock_reader(void)
 {
 	unsigned int seq;
@@ -817,6 +822,23 @@ static void test_assert_exclusive_access_scoped(struct kunit *test)
 	KUNIT_EXPECT_TRUE(test, match_expect_inscope);
 }
 
+/*
+ * jiffies is special (declared to be volatile) and its accesses are typically
+ * not marked; this test ensures that neither the compiler nor KCSAN gets
+ * confused about jiffies's declaration on different architectures.
+ */
+__no_kcsan
+static void test_jiffies_noreport(struct kunit *test)
+{
+	bool match_never = false;
+
+	begin_test_checks(test_kernel_jiffies_reader, test_kernel_jiffies_reader);
+	do {
+		match_never = report_available();
+	} while (!end_test_checks(match_never));
+	KUNIT_EXPECT_FALSE(test, match_never);
+}
+
 /* Test that racing accesses in seqlock critical sections are not reported. */
 __no_kcsan
 static void test_seqlock_noreport(struct kunit *test)
@@ -867,6 +889,7 @@ static struct kunit_case kcsan_test_cases[] = {
 	KCSAN_KUNIT_CASE(test_assert_exclusive_bits_nochange),
 	KCSAN_KUNIT_CASE(test_assert_exclusive_writer_scoped),
 	KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped),
+	KCSAN_KUNIT_CASE(test_jiffies_noreport),
 	KCSAN_KUNIT_CASE(test_seqlock_noreport),
 	{},
 };

From e68dcd8eac63cf14745df0dc872ea479df8ed4b9 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 18 Jun 2020 11:31:16 +0200
Subject: [PATCH 196/502] kcsan: Re-add GCC as a supported compiler

GCC version 11 recently implemented all requirements to correctly
support KCSAN:

1. Correct no_sanitize-attribute inlining behaviour:
   https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=4089df8ef4a63126b0774c39b6638845244c20d2

2. --param=tsan-distinguish-volatile
   https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=ab2789ec507a94f1a75a6534bca51c7b39037ce0

3. --param=tsan-instrument-func-entry-exit
   https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=06712fc68dc9843d9af7c7ac10047f49d305ad76

Therefore, we can re-enable GCC for KCSAN, and document the new compiler
requirements.

Signed-off-by: Marco Elver <elver@google.com>
Cc: Martin Liska <mliska@suse.cz>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/dev-tools/kcsan.rst | 3 ++-
 lib/Kconfig.kcsan                 | 3 ++-
 scripts/Makefile.kcsan            | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Documentation/dev-tools/kcsan.rst b/Documentation/dev-tools/kcsan.rst
index b38379f06194..be7a0b0e1f28 100644
--- a/Documentation/dev-tools/kcsan.rst
+++ b/Documentation/dev-tools/kcsan.rst
@@ -8,7 +8,8 @@ approach to detect races. KCSAN's primary purpose is to detect `data races`_.
 Usage
 -----
 
-KCSAN requires Clang version 11 or later.
+KCSAN is supported by both GCC and Clang. With GCC we require version 11 or
+later, and with Clang we also require version 11 or later.
 
 To enable KCSAN configure the kernel with::
 
diff --git a/lib/Kconfig.kcsan b/lib/Kconfig.kcsan
index 3f3b5bca7a8f..3d282d51849b 100644
--- a/lib/Kconfig.kcsan
+++ b/lib/Kconfig.kcsan
@@ -4,7 +4,8 @@ config HAVE_ARCH_KCSAN
 	bool
 
 config HAVE_KCSAN_COMPILER
-	def_bool CC_IS_CLANG && $(cc-option,-fsanitize=thread -mllvm -tsan-distinguish-volatile=1)
+	def_bool (CC_IS_CLANG && $(cc-option,-fsanitize=thread -mllvm -tsan-distinguish-volatile=1)) || \
+		 (CC_IS_GCC && $(cc-option,-fsanitize=thread --param tsan-distinguish-volatile=1))
 	help
 	  For the list of compilers that support KCSAN, please see
 	  <file:Documentation/dev-tools/kcsan.rst>.
diff --git a/scripts/Makefile.kcsan b/scripts/Makefile.kcsan
index bd4da1af5953..dd66206f4578 100644
--- a/scripts/Makefile.kcsan
+++ b/scripts/Makefile.kcsan
@@ -6,7 +6,7 @@ ifdef CONFIG_KCSAN
 ifdef CONFIG_CC_IS_CLANG
 cc-param = -mllvm -$(1)
 else
-cc-param = --param -$(1)
+cc-param = --param $(1)
 endif
 
 # Keep most options here optional, to allow enabling more compilers if absence

From 2839a232071f588d334543fb86f5689b43353842 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 18 Jun 2020 11:31:17 +0200
Subject: [PATCH 197/502] kcsan: Simplify compiler flags

Simplify the set of compiler flags for the runtime by removing cc-option
from -fno-stack-protector, because all supported compilers support it.
This saves us one compiler invocation during build.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index 092ce58d2e56..fea064afc4f7 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -7,8 +7,8 @@ CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
 
-CFLAGS_core.o := $(call cc-option,-fno-conserve-stack,) \
-	$(call cc-option,-fno-stack-protector,)
+CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+	-fno-stack-protector
 
 obj-y := core.o debugfs.o report.o
 obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o

From 61d56d7aa5eca3b909bce51ba8125b0fa44d7e17 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Thu, 18 Jun 2020 11:31:18 +0200
Subject: [PATCH 198/502] kcsan: Disable branch tracing in core runtime

Disable branch tracing in core KCSAN runtime if branches are being
traced (TRACE_BRANCH_PROFILING). This is to avoid its performance
impact, but also avoid recursion in case KCSAN is enabled for the branch
tracing runtime.

The latter had already been a problem for KASAN:
https://lore.kernel.org/lkml/CANpmjNOeXmD5E3O50Z3MjkiuCYaYOPyi+1rq=GZvEKwBvLR0Ug@mail.gmail.com/

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 kernel/kcsan/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index fea064afc4f7..65ca5539c470 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -8,7 +8,7 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
 
 CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
-	-fno-stack-protector
+	-fno-stack-protector -DDISABLE_BRANCH_PROFILING
 
 obj-y := core.o debugfs.o report.o
 obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o

From 38908de90a8c24c949505958f1d09812bb3b64aa Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@kernel.org>
Date: Wed, 22 Jan 2020 13:38:57 -0800
Subject: [PATCH 199/502] tools/memory-model: Add recent references

This commit updates the list of LKMM-related publications in
tools/memory-model/Documentation/references.txt.

Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
Acked-by: Andrea Parri <parri.andrea@gmail.com>
---
 .../memory-model/Documentation/references.txt | 21 +++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/tools/memory-model/Documentation/references.txt b/tools/memory-model/Documentation/references.txt
index b177f3e4a614..ecbbaa5396d4 100644
--- a/tools/memory-model/Documentation/references.txt
+++ b/tools/memory-model/Documentation/references.txt
@@ -73,6 +73,18 @@ o	Christopher Pulte, Shaked Flur, Will Deacon, Jon French,
 Linux-kernel memory model
 =========================
 
+o	Jade Alglave, Will Deacon, Boqun Feng, David Howells, Daniel
+	Lustig, Luc Maranget, Paul E. McKenney, Andrea Parri, Nicholas
+	Piggin, Alan Stern, Akira Yokosawa, and Peter Zijlstra.
+	2019. "Calibrating your fear of big bad optimizing compilers"
+	Linux Weekly News.  https://lwn.net/Articles/799218/
+
+o	Jade Alglave, Will Deacon, Boqun Feng, David Howells, Daniel
+	Lustig, Luc Maranget, Paul E. McKenney, Andrea Parri, Nicholas
+	Piggin, Alan Stern, Akira Yokosawa, and Peter Zijlstra.
+	2019. "Who's afraid of a big bad optimizing compiler?"
+	Linux Weekly News.  https://lwn.net/Articles/793253/
+
 o	Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and
 	Alan Stern.  2018. "Frightening small children and disconcerting
 	grown-ups: Concurrency in the Linux kernel". In Proceedings of
@@ -88,6 +100,11 @@ o	Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and
 	Alan Stern.  2017.  "A formal kernel memory-ordering model (part 2)"
 	Linux Weekly News.  https://lwn.net/Articles/720550/
 
+o	Jade Alglave, Luc Maranget, Paul E. McKenney, Andrea Parri, and
+	Alan Stern.  2017-2019.  "A Formal Model of Linux-Kernel Memory
+	Ordering" (backup material for the LWN articles)
+	https://mirrors.edge.kernel.org/pub/linux/kernel/people/paulmck/LWNLinuxMM/
+
 
 Memory-model tooling
 ====================
@@ -110,5 +127,5 @@ Memory-model comparisons
 ========================
 
 o	Paul E. McKenney, Ulrich Weigand, Andrea Parri, and Boqun
-	Feng. 2016. "Linux-Kernel Memory Model". (6 June 2016).
-	http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0124r2.html.
+	Feng. 2018. "Linux-Kernel Memory Model". (27 September 2018).
+	http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0124r6.html.

From c1b14609013a6b4c4b2d73583bde645540ebd9b7 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Mon, 2 Mar 2020 18:21:01 +0100
Subject: [PATCH 200/502] tools/memory-model: Fix "conflict" definition

The definition of "conflict" should not include the type of access nor
whether the accesses are concurrent or not, which this patch addresses.
The definition of "data race" remains unchanged.

The definition of "conflict" as we know it, and as cited by various
papers on memory consistency models, appeared in [1]: "Two accesses to
the same variable conflict if at least one is a write; two operations
conflict if they execute conflicting accesses."

The LKMM as well as the C11 memory model are adaptations of
data-race-free, which are based on the work in [2]. Necessarily, we need
both conflicting data operations (plain) and synchronization operations
(marked). For example, C11's definition is based on [3], which defines a
"data race" as: "Two memory operations conflict if they access the same
memory location, and at least one of them is a store, atomic store, or
atomic read-modify-write operation. In a sequentially consistent
execution, two memory operations from different threads form a type 1
data race if they conflict, at least one of them is a data operation,
and they are adjacent in <T (i.e., they may be executed concurrently)."

[1] D. Shasha, M. Snir, "Efficient and Correct Execution of Parallel
    Programs that Share Memory", 1988.
	URL: http://snir.cs.illinois.edu/listed/J21.pdf

[2] S. Adve, "Designing Memory Consistency Models for Shared-Memory
    Multiprocessors", 1993.
	URL: http://sadve.cs.illinois.edu/Publications/thesis.pdf

[3] H.-J. Boehm, S. Adve, "Foundations of the C++ Concurrency Memory
    Model", 2008.
	URL: https://www.hpl.hp.com/techreports/2008/HPL-2008-56.pdf

Signed-off-by: Marco Elver <elver@google.com>
Co-developed-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../Documentation/explanation.txt             | 83 ++++++++++---------
 1 file changed, 45 insertions(+), 38 deletions(-)

diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt
index e91a2eb19592..993f800659c6 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -1987,28 +1987,36 @@ outcome undefined.
 
 In technical terms, the compiler is allowed to assume that when the
 program executes, there will not be any data races.  A "data race"
-occurs when two conflicting memory accesses execute concurrently;
-two memory accesses "conflict" if:
+occurs when there are two memory accesses such that:
 
-	they access the same location,
+1.	they access the same location,
 
-	they occur on different CPUs (or in different threads on the
-	same CPU),
+2.	at least one of them is a store,
 
-	at least one of them is a plain access,
+3.	at least one of them is plain,
 
-	and at least one of them is a store.
+4.	they occur on different CPUs (or in different threads on the
+	same CPU), and
 
-The LKMM tries to determine whether a program contains two conflicting
-accesses which may execute concurrently; if it does then the LKMM says
-there is a potential data race and makes no predictions about the
-program's outcome.
+5.	they execute concurrently.
 
-Determining whether two accesses conflict is easy; you can see that
-all the concepts involved in the definition above are already part of
-the memory model.  The hard part is telling whether they may execute
-concurrently.  The LKMM takes a conservative attitude, assuming that
-accesses may be concurrent unless it can prove they cannot.
+In the literature, two accesses are said to "conflict" if they satisfy
+1 and 2 above.  We'll go a little farther and say that two accesses
+are "race candidates" if they satisfy 1 - 4.  Thus, whether or not two
+race candidates actually do race in a given execution depends on
+whether they are concurrent.
+
+The LKMM tries to determine whether a program contains race candidates
+which may execute concurrently; if it does then the LKMM says there is
+a potential data race and makes no predictions about the program's
+outcome.
+
+Determining whether two accesses are race candidates is easy; you can
+see that all the concepts involved in the definition above are already
+part of the memory model.  The hard part is telling whether they may
+execute concurrently.  The LKMM takes a conservative attitude,
+assuming that accesses may be concurrent unless it can prove they
+are not.
 
 If two memory accesses aren't concurrent then one must execute before
 the other.  Therefore the LKMM decides two accesses aren't concurrent
@@ -2171,8 +2179,8 @@ again, now using plain accesses for buf:
 	}
 
 This program does not contain a data race.  Although the U and V
-accesses conflict, the LKMM can prove they are not concurrent as
-follows:
+accesses are race candidates, the LKMM can prove they are not
+concurrent as follows:
 
 	The smp_wmb() fence in P0 is both a compiler barrier and a
 	cumul-fence.  It guarantees that no matter what hash of
@@ -2326,12 +2334,11 @@ could now perform the load of x before the load of ptr (there might be
 a control dependency but no address dependency at the machine level).
 
 Finally, it turns out there is a situation in which a plain write does
-not need to be w-post-bounded: when it is separated from the
-conflicting access by a fence.  At first glance this may seem
-impossible.  After all, to be conflicting the second access has to be
-on a different CPU from the first, and fences don't link events on
-different CPUs.  Well, normal fences don't -- but rcu-fence can!
-Here's an example:
+not need to be w-post-bounded: when it is separated from the other
+race-candidate access by a fence.  At first glance this may seem
+impossible.  After all, to be race candidates the two accesses must
+be on different CPUs, and fences don't link events on different CPUs.
+Well, normal fences don't -- but rcu-fence can!  Here's an example:
 
 	int x, y;
 
@@ -2367,7 +2374,7 @@ concurrent and there is no race, even though P1's plain store to y
 isn't w-post-bounded by any marked accesses.
 
 Putting all this material together yields the following picture.  For
-two conflicting stores W and W', where W ->co W', the LKMM says the
+race-candidate stores W and W', where W ->co W', the LKMM says the
 stores don't race if W can be linked to W' by a
 
 	w-post-bounded ; vis ; w-pre-bounded
@@ -2380,8 +2387,8 @@ sequence, and if W' is plain then they also have to be linked by a
 
 	w-post-bounded ; vis ; r-pre-bounded
 
-sequence.  For a conflicting load R and store W, the LKMM says the two
-accesses don't race if R can be linked to W by an
+sequence.  For race-candidate load R and store W, the LKMM says the
+two accesses don't race if R can be linked to W by an
 
 	r-post-bounded ; xb* ; w-pre-bounded
 
@@ -2413,20 +2420,20 @@ is, the rules governing the memory subsystem's choice of a store to
 satisfy a load request and its determination of where a store will
 fall in the coherence order):
 
-	If R and W conflict and it is possible to link R to W by one
-	of the xb* sequences listed above, then W ->rfe R is not
-	allowed (i.e., a load cannot read from a store that it
+	If R and W are race candidates and it is possible to link R to
+	W by one of the xb* sequences listed above, then W ->rfe R is
+	not allowed (i.e., a load cannot read from a store that it
 	executes before, even if one or both is plain).
 
-	If W and R conflict and it is possible to link W to R by one
-	of the vis sequences listed above, then R ->fre W is not
-	allowed (i.e., if a store is visible to a load then the load
-	must read from that store or one coherence-after it).
+	If W and R are race candidates and it is possible to link W to
+	R by one of the vis sequences listed above, then R ->fre W is
+	not allowed (i.e., if a store is visible to a load then the
+	load must read from that store or one coherence-after it).
 
-	If W and W' conflict and it is possible to link W to W' by one
-	of the vis sequences listed above, then W' ->co W is not
-	allowed (i.e., if one store is visible to a second then the
-	second must come after the first in the coherence order).
+	If W and W' are race candidates and it is possible to link W
+	to W' by one of the vis sequences listed above, then W' ->co W
+	is not allowed (i.e., if one store is visible to a second then
+	the second must come after the first in the coherence order).
 
 This is the extent to which the LKMM deals with plain accesses.
 Perhaps it could say more (for example, plain accesses might

From be4a37973cb078fc64d541f396b7d4d80e45fbe2 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sun, 22 Mar 2020 21:57:33 -0400
Subject: [PATCH 201/502] Documentation: LKMM: Add litmus test for RCU GP
 guarantee where updater frees object

This adds an example for the important RCU grace period guarantee, which
shows an RCU reader can never span a grace period.

Acked-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 .../litmus-tests/rcu/RCU+sync+free.litmus     | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 Documentation/litmus-tests/rcu/RCU+sync+free.litmus

diff --git a/Documentation/litmus-tests/rcu/RCU+sync+free.litmus b/Documentation/litmus-tests/rcu/RCU+sync+free.litmus
new file mode 100644
index 000000000000..4ee67e12f513
--- /dev/null
+++ b/Documentation/litmus-tests/rcu/RCU+sync+free.litmus
@@ -0,0 +1,42 @@
+C RCU+sync+free
+
+(*
+ * Result: Never
+ *
+ * This litmus test demonstrates that an RCU reader can never see a write that
+ * follows a grace period, if it did not see writes that precede that grace
+ * period.
+ *
+ * This is a typical pattern of RCU usage, where the write before the grace
+ * period assigns a pointer, and the writes following the grace period destroy
+ * the object that the pointer used to point to.
+ *
+ * This is one implication of the RCU grace-period guarantee, which says (among
+ * other things) that an RCU read-side critical section cannot span a grace period.
+ *)
+
+{
+int x = 1;
+int *y = &x;
+int z = 1;
+}
+
+P0(int *x, int *z, int **y)
+{
+	int *r0;
+	int r1;
+
+	rcu_read_lock();
+	r0 = rcu_dereference(*y);
+	r1 = READ_ONCE(*r0);
+	rcu_read_unlock();
+}
+
+P1(int *x, int *z, int **y)
+{
+	rcu_assign_pointer(*y, z);
+	synchronize_rcu();
+	WRITE_ONCE(*x, 0);
+}
+
+exists (0:r0=x /\ 0:r1=0)

From a591890c4e91f37ce858a3090b16e0eef2511575 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sun, 22 Mar 2020 21:57:34 -0400
Subject: [PATCH 202/502] Documentation: LKMM: Add litmus test for RCU GP
 guarantee where reader stores

This adds an example for the important RCU grace period guarantee, which
shows an RCU reader can never span a grace period.

Acked-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/litmus-tests/README             | 11 ++++++
 .../litmus-tests/rcu/RCU+sync+read.litmus     | 37 +++++++++++++++++++
 2 files changed, 48 insertions(+)
 create mode 100644 Documentation/litmus-tests/README
 create mode 100644 Documentation/litmus-tests/rcu/RCU+sync+read.litmus

diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README
new file mode 100644
index 000000000000..c4307ea9f996
--- /dev/null
+++ b/Documentation/litmus-tests/README
@@ -0,0 +1,11 @@
+============
+LITMUS TESTS
+============
+
+RCU (/rcu directory)
+--------------------
+
+RCU+sync+read.litmus
+RCU+sync+free.litmus
+    Both the above litmus tests demonstrate the RCU grace period guarantee
+    that an RCU read-side critical section can never span a grace period.
diff --git a/Documentation/litmus-tests/rcu/RCU+sync+read.litmus b/Documentation/litmus-tests/rcu/RCU+sync+read.litmus
new file mode 100644
index 000000000000..f34176720231
--- /dev/null
+++ b/Documentation/litmus-tests/rcu/RCU+sync+read.litmus
@@ -0,0 +1,37 @@
+C RCU+sync+read
+
+(*
+ * Result: Never
+ *
+ * This litmus test demonstrates that after a grace period, an RCU updater always
+ * sees all stores done in prior RCU read-side critical sections. Such
+ * read-side critical sections would have ended before the grace period ended.
+ *
+ * This is one implication of the RCU grace-period guarantee, which says (among
+ * other things) that an RCU read-side critical section cannot span a grace period.
+ *)
+
+{
+int x = 0;
+int y = 0;
+}
+
+P0(int *x, int *y)
+{
+	rcu_read_lock();
+	WRITE_ONCE(*x, 1);
+	WRITE_ONCE(*y, 1);
+	rcu_read_unlock();
+}
+
+P1(int *x, int *y)
+{
+	int r0;
+	int r1;
+
+	r0 = READ_ONCE(*x);
+	synchronize_rcu();
+	r1 = READ_ONCE(*y);
+}
+
+exists (1:r0=1 /\ 1:r1=0)

From 7f871338ff939952c4e04a83ae395ff9d57040c2 Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Sun, 22 Mar 2020 21:57:35 -0400
Subject: [PATCH 203/502] MAINTAINERS: Update maintainers for new
 Documentation/litmus-tests

This commit adds Joel Fernandes as official LKMM reviewer.

Acked-by: Boqun Feng <boqun.feng@gmail.com>
Acked-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
[ paulmck: Apply Joe Perches alphabetization feedback. ]
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 MAINTAINERS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 496fd4eafb68..b2578efb6c0e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9960,6 +9960,7 @@ M:	Luc Maranget <luc.maranget@inria.fr>
 M:	"Paul E. McKenney" <paulmck@kernel.org>
 R:	Akira Yokosawa <akiyks@gmail.com>
 R:	Daniel Lustig <dlustig@nvidia.com>
+R:	Joel Fernandes <joel@joelfernandes.org>
 L:	linux-kernel@vger.kernel.org
 L:	linux-arch@vger.kernel.org
 S:	Supported
@@ -9968,6 +9969,7 @@ F:	Documentation/atomic_bitops.txt
 F:	Documentation/atomic_t.txt
 F:	Documentation/core-api/atomic_ops.rst
 F:	Documentation/core-api/refcount-vs-atomic.rst
+F:	Documentation/litmus-tests/
 F:	Documentation/memory-barriers.txt
 F:	tools/memory-model/
 

From 4a9cc65f7a715ba1f4f58529f7bf6f1548d8701f Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Thu, 26 Mar 2020 10:40:19 +0800
Subject: [PATCH 204/502] tools/memory-model: Add an exception for limitations
 on _unless() family

According to Luc, atomic_add_unless() is directly provided by herd7,
therefore it can be used in litmus tests. So change the limitation
section in README to unlimit the use of atomic_add_unless().

Cc: Luc Maranget <luc.maranget@inria.fr>
Acked-by: Andrea Parri <parri.andrea@gmail.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/memory-model/README | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/memory-model/README b/tools/memory-model/README
index fc07b52f2028..b9c562e92981 100644
--- a/tools/memory-model/README
+++ b/tools/memory-model/README
@@ -207,11 +207,15 @@ The Linux-kernel memory model (LKMM) has the following limitations:
 		case as a store release.
 
 	b.	The "unless" RMW operations are not currently modeled:
-		atomic_long_add_unless(), atomic_add_unless(),
-		atomic_inc_unless_negative(), and
-		atomic_dec_unless_positive().  These can be emulated
+		atomic_long_add_unless(), atomic_inc_unless_negative(),
+		and atomic_dec_unless_positive().  These can be emulated
 		in litmus tests, for example, by using atomic_cmpxchg().
 
+		One exception of this limitation is atomic_add_unless(),
+		which is provided directly by herd7 (so no corresponding
+		definition in linux-kernel.def).  atomic_add_unless() is
+		modeled by herd7 therefore it can be used in litmus tests.
+
 	c.	The call_rcu() function is not modeled.  It can be
 		emulated in litmus tests by adding another process that
 		invokes synchronize_rcu() and the body of the callback

From efff6150209694a78c8af8c2a7557af682086220 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Thu, 26 Mar 2020 10:40:20 +0800
Subject: [PATCH 205/502] Documentation/litmus-tests: Introduce atomic
 directory

Although we have atomic_t.txt and its friends to describe the semantics
of atomic APIs and lib/atomic64_test.c for build testing and testing in
UP mode, the tests for our atomic APIs in real SMP mode are still
missing. Since now we have the LKMM tool in kernel and litmus tests can
be used to generate kernel modules for testing purpose with "klitmus" (a
tool from the LKMM toolset), it makes sense to put a few typical litmus
tests into kernel so that

1)	they are the examples to describe the conceptual model of the
	semantics of atomic APIs, and

2)	they can be used to generate kernel test modules for anyone
	who is interested to test the atomic APIs implementation (in
	most cases, is the one who implements the APIs for a new arch)

Therefore, introduce the atomic directory for this purpose. The
directory is maintained by the LKMM group to make sure the litmus tests
are always aligned with our memory model.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Andrea Parri <parri.andrea@gmail.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/litmus-tests/atomic/README | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 Documentation/litmus-tests/atomic/README

diff --git a/Documentation/litmus-tests/atomic/README b/Documentation/litmus-tests/atomic/README
new file mode 100644
index 000000000000..ae61201a4271
--- /dev/null
+++ b/Documentation/litmus-tests/atomic/README
@@ -0,0 +1,4 @@
+This directory contains litmus tests that are typical to describe the semantics
+of our atomic APIs. For more information about how to "run" a litmus test or
+how to generate a kernel test module based on a litmus test, please see
+tools/memory-model/README.

From 4dcd4d36ddb1fa7fa7257ffe9e711608119b9785 Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Thu, 26 Mar 2020 10:40:21 +0800
Subject: [PATCH 206/502] Documentation/litmus-tests/atomic: Add a test for
 atomic_set()

We already use a litmus test in atomic_t.txt to describe the behavior of
an atomic_set() with an atomic RMW, so add it into atomic-tests
directory to make it easily accessible for anyone who cares about the
semantics of our atomic APIs.

Besides currently the litmus test "atomic-set" in atomic_t.txt has a few
things to be improved:

1)	The CPU/Processor numbers "P1,P2" are not only inconsistent with
	the rest of the document, which uses "CPU0" and "CPU1", but also
	unacceptable by the herd tool, which requires processors start
	at "P0".

2)	The initialization block uses a "atomic_set()", which is OK, but
	it's better to use ATOMIC_INIT() to make clear this is an
	initialization.

3)	The return value of atomic_add_unless() is discarded
	implicitly, which is OK for C language, but it will be helpful
	to the herd tool if we use a void cast to make the discard
	explicit.

4)	The name and the paragraph describing the test need to be more
	accurate and aligned with our wording in LKMM.

Therefore fix these in both atomic_t.txt and the new added litmus test.

Acked-by: Andrea Parri <parri.andrea@gmail.com>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/atomic_t.txt                    | 18 +++++++-------
 ...c-RMW-ops-are-atomic-WRT-atomic_set.litmus | 24 +++++++++++++++++++
 Documentation/litmus-tests/atomic/README      |  7 ++++++
 3 files changed, 40 insertions(+), 9 deletions(-)
 create mode 100644 Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus

diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
index 0ab747e0d5ac..67d1d99f8589 100644
--- a/Documentation/atomic_t.txt
+++ b/Documentation/atomic_t.txt
@@ -85,21 +85,21 @@ smp_store_release() respectively. Therefore, if you find yourself only using
 the Non-RMW operations of atomic_t, you do not in fact need atomic_t at all
 and are doing it wrong.
 
-A subtle detail of atomic_set{}() is that it should be observable to the RMW
-ops. That is:
+A note for the implementation of atomic_set{}() is that it must not break the
+atomicity of the RMW ops. That is:
 
-  C atomic-set
+  C Atomic-RMW-ops-are-atomic-WRT-atomic_set
 
   {
-    atomic_set(v, 1);
+    atomic_t v = ATOMIC_INIT(1);
+  }
+
+  P0(atomic_t *v)
+  {
+    (void)atomic_add_unless(v, 1, 0);
   }
 
   P1(atomic_t *v)
-  {
-    atomic_add_unless(v, 1, 0);
-  }
-
-  P2(atomic_t *v)
   {
     atomic_set(v, 0);
   }
diff --git a/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus b/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
new file mode 100644
index 000000000000..49385314d911
--- /dev/null
+++ b/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
@@ -0,0 +1,24 @@
+C Atomic-RMW-ops-are-atomic-WRT-atomic_set
+
+(*
+ * Result: Never
+ *
+ * Test that atomic_set() cannot break the atomicity of atomic RMWs.
+ *)
+
+{
+	atomic_t v = ATOMIC_INIT(1);
+}
+
+P0(atomic_t *v)
+{
+	(void)atomic_add_unless(v, 1, 0);
+}
+
+P1(atomic_t *v)
+{
+	atomic_set(v, 0);
+}
+
+exists
+(v=2)
diff --git a/Documentation/litmus-tests/atomic/README b/Documentation/litmus-tests/atomic/README
index ae61201a4271..a1b72410b539 100644
--- a/Documentation/litmus-tests/atomic/README
+++ b/Documentation/litmus-tests/atomic/README
@@ -2,3 +2,10 @@ This directory contains litmus tests that are typical to describe the semantics
 of our atomic APIs. For more information about how to "run" a litmus test or
 how to generate a kernel test module based on a litmus test, please see
 tools/memory-model/README.
+
+============
+LITMUS TESTS
+============
+
+Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
+	Test that atomic_set() cannot break the atomicity of atomic RMWs.

From e30d02355536e9678ab8a4dfcd6e90a86479b10f Mon Sep 17 00:00:00 2001
From: Boqun Feng <boqun.feng@gmail.com>
Date: Thu, 26 Mar 2020 10:40:22 +0800
Subject: [PATCH 207/502] Documentation/litmus-tests/atomic: Add a test for
 smp_mb__after_atomic()

We already use a litmus test in atomic_t.txt to describe that atomic RMW +
smp_mb__after_atomic() is stronger than acquire (both the read and the
write parts are ordered). So make it a litmus test in atomic-tests
directory, so that people can access the litmus easily.

Additionally, change the processor numbers "P1, P2" to "P0, P1" in
atomic_t.txt for the consistency with the processor numbers in the
litmus test, which herd can handle.

Acked-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
Reviewed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/atomic_t.txt                    | 10 +++---
 ...ter_atomic-is-stronger-than-acquire.litmus | 32 +++++++++++++++++++
 Documentation/litmus-tests/atomic/README      |  5 +++
 3 files changed, 42 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/litmus-tests/atomic/Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus

diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
index 67d1d99f8589..0f1fdedf36bb 100644
--- a/Documentation/atomic_t.txt
+++ b/Documentation/atomic_t.txt
@@ -233,19 +233,19 @@ as well. Similarly, something like:
 is an ACQUIRE pattern (though very much not typical), but again the barrier is
 strictly stronger than ACQUIRE. As illustrated:
 
-  C strong-acquire
+  C Atomic-RMW+mb__after_atomic-is-stronger-than-acquire
 
   {
   }
 
-  P1(int *x, atomic_t *y)
+  P0(int *x, atomic_t *y)
   {
     r0 = READ_ONCE(*x);
     smp_rmb();
     r1 = atomic_read(y);
   }
 
-  P2(int *x, atomic_t *y)
+  P1(int *x, atomic_t *y)
   {
     atomic_inc(y);
     smp_mb__after_atomic();
@@ -253,14 +253,14 @@ strictly stronger than ACQUIRE. As illustrated:
   }
 
   exists
-  (r0=1 /\ r1=0)
+  (0:r0=1 /\ 0:r1=0)
 
 This should not happen; but a hypothetical atomic_inc_acquire() --
 (void)atomic_fetch_inc_acquire() for instance -- would allow the outcome,
 because it would not order the W part of the RMW against the following
 WRITE_ONCE.  Thus:
 
-  P1			P2
+  P0			P1
 
 			t = LL.acq *y (0)
 			t++;
diff --git a/Documentation/litmus-tests/atomic/Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus b/Documentation/litmus-tests/atomic/Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus
new file mode 100644
index 000000000000..9a8e31a44b28
--- /dev/null
+++ b/Documentation/litmus-tests/atomic/Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus
@@ -0,0 +1,32 @@
+C Atomic-RMW+mb__after_atomic-is-stronger-than-acquire
+
+(*
+ * Result: Never
+ *
+ * Test that an atomic RMW followed by a smp_mb__after_atomic() is
+ * stronger than a normal acquire: both the read and write parts of
+ * the RMW are ordered before the subsequential memory accesses.
+ *)
+
+{
+}
+
+P0(int *x, atomic_t *y)
+{
+	int r0;
+	int r1;
+
+	r0 = READ_ONCE(*x);
+	smp_rmb();
+	r1 = atomic_read(y);
+}
+
+P1(int *x, atomic_t *y)
+{
+	atomic_inc(y);
+	smp_mb__after_atomic();
+	WRITE_ONCE(*x, 1);
+}
+
+exists
+(0:r0=1 /\ 0:r1=0)
diff --git a/Documentation/litmus-tests/atomic/README b/Documentation/litmus-tests/atomic/README
index a1b72410b539..714cf93816ea 100644
--- a/Documentation/litmus-tests/atomic/README
+++ b/Documentation/litmus-tests/atomic/README
@@ -7,5 +7,10 @@ tools/memory-model/README.
 LITMUS TESTS
 ============
 
+Atomic-RMW+mb__after_atomic-is-stronger-than-acquire
+	Test that an atomic RMW followed by a smp_mb__after_atomic() is
+	stronger than a normal acquire: both the read and write parts of
+	the RMW are ordered before the subsequential memory accesses.
+
 Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
 	Test that atomic_set() cannot break the atomicity of atomic RMWs.

From 9725dd55512772422e195cf0cfbca1eda6778358 Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Sun, 10 May 2020 13:37:14 +0900
Subject: [PATCH 208/502] tools/memory-model: Fix reference to litmus test in
 recipes.txt

The name of the litmus test doesn't match the one described below.
Fix the name of the litmus test.

Acked-by: Andrea Parri <parri.andrea@gmail.com>
Acked-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/memory-model/Documentation/recipes.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/memory-model/Documentation/recipes.txt b/tools/memory-model/Documentation/recipes.txt
index 7fe8d7aa3029..63c4adfed884 100644
--- a/tools/memory-model/Documentation/recipes.txt
+++ b/tools/memory-model/Documentation/recipes.txt
@@ -126,7 +126,7 @@ However, it is not necessarily the case that accesses ordered by
 locking will be seen as ordered by CPUs not holding that lock.
 Consider this example:
 
-	/* See Z6.0+pooncerelease+poacquirerelease+fencembonceonce.litmus. */
+	/* See Z6.0+pooncelock+pooncelock+pombonce.litmus. */
 	void CPU0(void)
 	{
 		spin_lock(&mylock);

From cdaac9d6d23d7a7f9edbb568191d05f2b660fff0 Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Sun, 10 May 2020 15:12:57 +0900
Subject: [PATCH 209/502] Documentation/litmus-tests: Merge atomic's README
 into top-level one

Where Documentation/litmus-tests/README lists RCU litmus tests,
Documentation/litmus-tests/atomic/README lists atomic litmus tests.
For symmetry, merge the latter into former, with some context
adjustment in the introduction.

Acked-by: Andrea Parri <parri.andrea@gmail.com>
Acked-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Acked-by: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/litmus-tests/README        | 19 +++++++++++++++++++
 Documentation/litmus-tests/atomic/README | 16 ----------------
 2 files changed, 19 insertions(+), 16 deletions(-)
 delete mode 100644 Documentation/litmus-tests/atomic/README

diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README
index c4307ea9f996..ac0b270b456c 100644
--- a/Documentation/litmus-tests/README
+++ b/Documentation/litmus-tests/README
@@ -2,6 +2,25 @@
 LITMUS TESTS
 ============
 
+Each subdirectory contains litmus tests that are typical to describe the
+semantics of respective kernel APIs.
+For more information about how to "run" a litmus test or how to generate
+a kernel test module based on a litmus test, please see
+tools/memory-model/README.
+
+
+atomic (/atomic derectory)
+--------------------------
+
+Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus
+    Test that an atomic RMW followed by a smp_mb__after_atomic() is
+    stronger than a normal acquire: both the read and write parts of
+    the RMW are ordered before the subsequential memory accesses.
+
+Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
+    Test that atomic_set() cannot break the atomicity of atomic RMWs.
+
+
 RCU (/rcu directory)
 --------------------
 
diff --git a/Documentation/litmus-tests/atomic/README b/Documentation/litmus-tests/atomic/README
deleted file mode 100644
index 714cf93816ea..000000000000
--- a/Documentation/litmus-tests/atomic/README
+++ /dev/null
@@ -1,16 +0,0 @@
-This directory contains litmus tests that are typical to describe the semantics
-of our atomic APIs. For more information about how to "run" a litmus test or
-how to generate a kernel test module based on a litmus test, please see
-tools/memory-model/README.
-
-============
-LITMUS TESTS
-============
-
-Atomic-RMW+mb__after_atomic-is-stronger-than-acquire
-	Test that an atomic RMW followed by a smp_mb__after_atomic() is
-	stronger than a normal acquire: both the read and write parts of
-	the RMW are ordered before the subsequential memory accesses.
-
-Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
-	Test that atomic_set() cannot break the atomicity of atomic RMWs.

From c425fb5f8d2c8d22e7baad6dc077703c2b329d2d Mon Sep 17 00:00:00 2001
From: "Joel Fernandes (Google)" <joel@joelfernandes.org>
Date: Mon, 11 May 2020 22:06:46 -0400
Subject: [PATCH 210/502] Documentation/litmus-tests: Cite an RCU litmus test

This commit cites a pertinent RCU-related litmus test.

Co-developed-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Co-developed-by: Akira Yokosawa <akiyks@gmail.com>
[Alan: grammar nit]
[ paulmck: Update commit log and title per Akira feedback. ]
Suggested-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Joel Fernandes (Google) <joel@joelfernandes.org>
Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/litmus-tests/README | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README
index ac0b270b456c..b79e640214b9 100644
--- a/Documentation/litmus-tests/README
+++ b/Documentation/litmus-tests/README
@@ -24,6 +24,10 @@ Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
 RCU (/rcu directory)
 --------------------
 
+MP+onceassign+derefonce.litmus (under tools/memory-model/litmus-tests/)
+    Demonstrates the use of rcu_assign_pointer() and rcu_dereference() to
+    ensure that an RCU reader will not see pre-initialization garbage.
+
 RCU+sync+read.litmus
 RCU+sync+free.litmus
     Both the above litmus tests demonstrate the RCU grace period guarantee

From d075a78a5ab19389d5600923d6ad5391d7cd1be8 Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Sun, 31 May 2020 20:04:32 +0900
Subject: [PATCH 211/502] tools/memory-model/README: Expand dependency of
 klitmus7

klitmus7 is independent of the memory model but depends on the
build-target kernel release.
It occasionally lost compatibility due to kernel API changes [1, 2, 3].
It was remedied in a backwards-compatible manner respectively [4, 5, 6].

Reflect this fact in README.

[1]: b899a850431e ("compiler.h: Remove ACCESS_ONCE()")
[2]: 0bb95f80a38f ("Makefile: Globally enable VLA warning")
[3]: d56c0d45f0e2 ("proc: decouple proc from VFS with "struct proc_ops"")
[4]: https://github.com/herd/herdtools7/commit/e87d7f9287d1
     ("klitmus: Use WRITE_ONCE and READ_ONCE in place of deprecated ACCESS_ONCE")
[5]: https://github.com/herd/herdtools7/commit/a0cbb10d02be
     ("klitmus: Avoid variable length array")
[6]: https://github.com/herd/herdtools7/commit/46b9412d3a58
     ("klitmus: Linux kernel v5.6.x compat")

NOTE: [5] was ahead of herdtools7 7.53, which did not make an
official release.  Code generated by klitmus7 without [5] can still be
built targeting Linux 4.20--5.5 if you don't care about VLA warnings.

Acked-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/memory-model/README | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/tools/memory-model/README b/tools/memory-model/README
index b9c562e92981..90af203c3cf1 100644
--- a/tools/memory-model/README
+++ b/tools/memory-model/README
@@ -28,8 +28,34 @@ downloaded separately:
 See "herdtools7/INSTALL.md" for installation instructions.
 
 Note that although these tools usually provide backwards compatibility,
-this is not absolutely guaranteed.  Therefore, if a later version does
-not work, please try using the exact version called out above.
+this is not absolutely guaranteed.
+
+For example, a future version of herd7 might not work with the model
+in this release.  A compatible model will likely be made available in
+a later release of Linux kernel.
+
+If you absolutely need to run the model in this particular release,
+please try using the exact version called out above.
+
+klitmus7 is independent of the model provided here.  It has its own
+dependency on a target kernel release where converted code is built
+and executed.  Any change in kernel APIs essential to klitmus7 will
+necessitate an upgrade of klitmus7.
+
+If you find any compatibility issues in klitmus7, please inform the
+memory model maintainers.
+
+klitmus7 Compatibility Table
+----------------------------
+
+	============  ==========
+	target Linux  herdtools7
+	------------  ----------
+	     -- 4.18  7.48 --
+	4.15 -- 4.19  7.49 --
+	4.20 -- 5.5   7.54 --
+	5.6  --       HEAD
+	============  ==========
 
 
 ==================

From 2bfa5c62debe43e3779e03bfc66b75ab72098db1 Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Wed, 24 Jun 2020 06:56:43 +0900
Subject: [PATCH 212/502] tools/memory-model/README: Mention herdtools7 7.56 in
 compatibility table

herdtools7 7.56 is going to be released in the week of 22 Jun 2020.
This commit therefore adds the exact version in the compatibility table.

Acked-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 tools/memory-model/README | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/memory-model/README b/tools/memory-model/README
index 90af203c3cf1..ecb7385376bf 100644
--- a/tools/memory-model/README
+++ b/tools/memory-model/README
@@ -54,7 +54,7 @@ klitmus7 Compatibility Table
 	     -- 4.18  7.48 --
 	4.15 -- 4.19  7.49 --
 	4.20 -- 5.5   7.54 --
-	5.6  --       HEAD
+	5.6  --       7.56 --
 	============  ==========
 
 

From 5ef0a07a7928539d46fdb163acfad28c6d877a89 Mon Sep 17 00:00:00 2001
From: Akira Yokosawa <akiyks@gmail.com>
Date: Wed, 24 Jun 2020 06:59:26 +0900
Subject: [PATCH 213/502] Documentation/litmus-tests: Add note on herd7 7.56 in
 atomic litmus test

herdtools 7.56 has enhanced herd7's C parser so that the "(void)expr"
construct in Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus is
accepted.

This is independent of LKMM's cat model, so mention the required
version in the header of the litmus test and its entry in README.

CC: Boqun Feng <boqun.feng@gmail.com>
Reported-by: Andrea Parri <parri.andrea@gmail.com>
Acked-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Akira Yokosawa <akiyks@gmail.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
---
 Documentation/litmus-tests/README                                | 1 +
 .../atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus       | 1 +
 2 files changed, 2 insertions(+)

diff --git a/Documentation/litmus-tests/README b/Documentation/litmus-tests/README
index b79e640214b9..7f5c6c3ed6c3 100644
--- a/Documentation/litmus-tests/README
+++ b/Documentation/litmus-tests/README
@@ -19,6 +19,7 @@ Atomic-RMW+mb__after_atomic-is-stronger-than-acquire.litmus
 
 Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
     Test that atomic_set() cannot break the atomicity of atomic RMWs.
+    NOTE: Require herd7 7.56 or later which supports "(void)expr".
 
 
 RCU (/rcu directory)
diff --git a/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus b/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
index 49385314d911..ffd4d3e79c4a 100644
--- a/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
+++ b/Documentation/litmus-tests/atomic/Atomic-RMW-ops-are-atomic-WRT-atomic_set.litmus
@@ -4,6 +4,7 @@ C Atomic-RMW-ops-are-atomic-WRT-atomic_set
  * Result: Never
  *
  * Test that atomic_set() cannot break the atomicity of atomic RMWs.
+ * NOTE: This requires herd7 7.56 or later which supports "(void)expr".
  *)
 
 {

From 7c86ffeeed303187f266ed17bd87a9b375955709 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 29 Jun 2020 13:12:59 +0300
Subject: [PATCH 214/502] io_uring: deduplicate freeing linked timeouts

Linked timeout cancellation code is repeated in io_req_link_next()
and io_fail_links(), and they differ in details even though they shouldn't.
Based on the fact that there is at most one armed linked timeout in
a link, and it immediately follows the head, extract a function that
will check for it and defuse it.

Justification:
- DRY and cleaner
- better inlining for io_req_link_next() (just 1 call site now)
- isolates linked_timeouts from common path
- reduces time under spinlock for failed links
- actually less code

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
[axboe: fold in locking fix for io_fail_links()]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 107 +++++++++++++++++++++++++++-----------------------
 1 file changed, 58 insertions(+), 49 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 92c7e2a96912..a0aea78162a6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1552,48 +1552,57 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
 	return false;
 }
 
-static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static void io_kill_linked_timeout(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_kiocb *link;
 	bool wake_ev = false;
+	unsigned long flags = 0; /* false positive warning */
+
+	if (!(req->flags & REQ_F_COMP_LOCKED))
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+
+	if (list_empty(&req->link_list))
+		goto out;
+	link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+	if (link->opcode != IORING_OP_LINK_TIMEOUT)
+		goto out;
+
+	list_del_init(&link->link_list);
+	wake_ev = io_link_cancel_timeout(link);
+	req->flags &= ~REQ_F_LINK_TIMEOUT;
+out:
+	if (!(req->flags & REQ_F_COMP_LOCKED))
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	if (wake_ev)
+		io_cqring_ev_posted(ctx);
+}
+
+static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+{
+	struct io_kiocb *nxt;
 
 	/*
 	 * The list should never be empty when we are called here. But could
 	 * potentially happen if the chain is messed up, check to be on the
 	 * safe side.
 	 */
-	while (!list_empty(&req->link_list)) {
-		struct io_kiocb *nxt = list_first_entry(&req->link_list,
-						struct io_kiocb, link_list);
+	if (unlikely(list_empty(&req->link_list)))
+		return;
 
-		if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
-			     (nxt->flags & REQ_F_TIMEOUT))) {
-			list_del_init(&nxt->link_list);
-			wake_ev |= io_link_cancel_timeout(nxt);
-			req->flags &= ~REQ_F_LINK_TIMEOUT;
-			continue;
-		}
-
-		list_del_init(&req->link_list);
-		if (!list_empty(&nxt->link_list))
-			nxt->flags |= REQ_F_LINK_HEAD;
-		*nxtptr = nxt;
-		break;
-	}
-
-	if (wake_ev)
-		io_cqring_ev_posted(ctx);
+	nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
+	list_del_init(&req->link_list);
+	if (!list_empty(&nxt->link_list))
+		nxt->flags |= REQ_F_LINK_HEAD;
+	*nxtptr = nxt;
 }
 
 /*
  * Called if REQ_F_LINK_HEAD is set, and we fail the head request
  */
-static void io_fail_links(struct io_kiocb *req)
+static void __io_fail_links(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ctx->completion_lock, flags);
 
 	while (!list_empty(&req->link_list)) {
 		struct io_kiocb *link = list_first_entry(&req->link_list,
@@ -1602,18 +1611,29 @@ static void io_fail_links(struct io_kiocb *req)
 		list_del_init(&link->link_list);
 		trace_io_uring_fail_link(req, link);
 
-		if ((req->flags & REQ_F_LINK_TIMEOUT) &&
-		    link->opcode == IORING_OP_LINK_TIMEOUT) {
-			io_link_cancel_timeout(link);
-		} else {
-			io_cqring_fill_event(link, -ECANCELED);
-			__io_double_put_req(link);
-		}
+		io_cqring_fill_event(link, -ECANCELED);
+		__io_double_put_req(link);
 		req->flags &= ~REQ_F_LINK_TIMEOUT;
 	}
 
 	io_commit_cqring(ctx);
-	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	io_cqring_ev_posted(ctx);
+}
+
+static void io_fail_links(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+
+	if (!(req->flags & REQ_F_COMP_LOCKED)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		__io_fail_links(req);
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	} else {
+		__io_fail_links(req);
+	}
+
 	io_cqring_ev_posted(ctx);
 }
 
@@ -1623,30 +1643,19 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 		return;
 	req->flags &= ~REQ_F_LINK_HEAD;
 
+	if (req->flags & REQ_F_LINK_TIMEOUT)
+		io_kill_linked_timeout(req);
+
 	/*
 	 * If LINK is set, we have dependent requests in this chain. If we
 	 * didn't fail this request, queue the first one up, moving any other
 	 * dependencies to the next request. In case of failure, fail the rest
 	 * of the chain.
 	 */
-	if (req->flags & REQ_F_FAIL_LINK) {
+	if (req->flags & REQ_F_FAIL_LINK)
 		io_fail_links(req);
-	} else if ((req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_COMP_LOCKED)) ==
-			REQ_F_LINK_TIMEOUT) {
-		struct io_ring_ctx *ctx = req->ctx;
-		unsigned long flags;
-
-		/*
-		 * If this is a timeout link, we could be racing with the
-		 * timeout timer. Grab the completion lock for this case to
-		 * protect against that.
-		 */
-		spin_lock_irqsave(&ctx->completion_lock, flags);
+	else
 		io_req_link_next(req, nxt);
-		spin_unlock_irqrestore(&ctx->completion_lock, flags);
-	} else {
-		io_req_link_next(req, nxt);
-	}
 }
 
 static void __io_req_task_cancel(struct io_kiocb *req, int error)

From 9b5f7bd93272689ec8dc2cfd40a812265c23414e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 29 Jun 2020 13:13:00 +0300
Subject: [PATCH 215/502] io_uring: replace find_next() out param with ret

Generally, it's better to return a value directly than having out
parameter. It's cleaner and saves from some kinds of ugly bugs.
May also be faster.

Return next request from io_req_find_next() and friends directly
instead of passing out parameter.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a0aea78162a6..0234dc2c9625 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1578,7 +1578,7 @@ out:
 		io_cqring_ev_posted(ctx);
 }
 
-static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static struct io_kiocb *io_req_link_next(struct io_kiocb *req)
 {
 	struct io_kiocb *nxt;
 
@@ -1588,13 +1588,13 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 	 * safe side.
 	 */
 	if (unlikely(list_empty(&req->link_list)))
-		return;
+		return NULL;
 
 	nxt = list_first_entry(&req->link_list, struct io_kiocb, link_list);
 	list_del_init(&req->link_list);
 	if (!list_empty(&nxt->link_list))
 		nxt->flags |= REQ_F_LINK_HEAD;
-	*nxtptr = nxt;
+	return nxt;
 }
 
 /*
@@ -1637,10 +1637,10 @@ static void io_fail_links(struct io_kiocb *req)
 	io_cqring_ev_posted(ctx);
 }
 
-static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
+static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 {
 	if (likely(!(req->flags & REQ_F_LINK_HEAD)))
-		return;
+		return NULL;
 	req->flags &= ~REQ_F_LINK_HEAD;
 
 	if (req->flags & REQ_F_LINK_TIMEOUT)
@@ -1652,10 +1652,10 @@ static void io_req_find_next(struct io_kiocb *req, struct io_kiocb **nxt)
 	 * dependencies to the next request. In case of failure, fail the rest
 	 * of the chain.
 	 */
-	if (req->flags & REQ_F_FAIL_LINK)
-		io_fail_links(req);
-	else
-		io_req_link_next(req, nxt);
+	if (likely(!(req->flags & REQ_F_FAIL_LINK)))
+		return io_req_link_next(req);
+	io_fail_links(req);
+	return NULL;
 }
 
 static void __io_req_task_cancel(struct io_kiocb *req, int error)
@@ -1718,9 +1718,8 @@ static void io_req_task_queue(struct io_kiocb *req)
 
 static void io_queue_next(struct io_kiocb *req)
 {
-	struct io_kiocb *nxt = NULL;
+	struct io_kiocb *nxt = io_req_find_next(req);
 
-	io_req_find_next(req, &nxt);
 	if (nxt)
 		io_req_task_queue(nxt);
 }
@@ -1770,13 +1769,15 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
  * Drop reference to request, return next in chain (if there is one) if this
  * was the last reference to this request.
  */
-__attribute__((nonnull))
-static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
+static struct io_kiocb *io_put_req_find_next(struct io_kiocb *req)
 {
+	struct io_kiocb *nxt = NULL;
+
 	if (refcount_dec_and_test(&req->refs)) {
-		io_req_find_next(req, nxtptr);
+		nxt = io_req_find_next(req);
 		__io_free_req(req);
 	}
+	return nxt;
 }
 
 static void io_put_req(struct io_kiocb *req)
@@ -1797,7 +1798,7 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req)
 	if (refcount_read(&req->refs) != 1)
 		return NULL;
 
-	io_req_find_next(req, &nxt);
+	nxt = io_req_find_next(req);
 	if (!nxt)
 		return NULL;
 
@@ -4488,7 +4489,7 @@ static void io_poll_task_handler(struct io_kiocb *req, struct io_kiocb **nxt)
 	hash_del(&req->hash_node);
 	io_poll_complete(req, req->result, 0);
 	req->flags |= REQ_F_COMP_LOCKED;
-	io_put_req_find_next(req, nxt);
+	*nxt = io_put_req_find_next(req);
 	spin_unlock_irq(&ctx->completion_lock);
 
 	io_cqring_ev_posted(ctx);
@@ -5938,9 +5939,8 @@ punt:
 	}
 
 err:
-	nxt = NULL;
 	/* drop submission reference */
-	io_put_req_find_next(req, &nxt);
+	nxt = io_put_req_find_next(req);
 
 	if (linked_timeout) {
 		if (!ret)

From a1a4661691c5f1a3af4c04f56ad68e2d1dbee3af Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 29 Jun 2020 13:13:01 +0300
Subject: [PATCH 216/502] io_uring: kill REQ_F_TIMEOUT

Now REQ_F_TIMEOUT is set but never used, kill it

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 0234dc2c9625..e9c8f52daf8f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -531,7 +531,6 @@ enum {
 	REQ_F_CUR_POS_BIT,
 	REQ_F_NOWAIT_BIT,
 	REQ_F_LINK_TIMEOUT_BIT,
-	REQ_F_TIMEOUT_BIT,
 	REQ_F_ISREG_BIT,
 	REQ_F_TIMEOUT_NOSEQ_BIT,
 	REQ_F_COMP_LOCKED_BIT,
@@ -574,8 +573,6 @@ enum {
 	REQ_F_NOWAIT		= BIT(REQ_F_NOWAIT_BIT),
 	/* has linked timeout */
 	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
-	/* timeout request */
-	REQ_F_TIMEOUT		= BIT(REQ_F_TIMEOUT_BIT),
 	/* regular file */
 	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
 	/* no timeout sequence */
@@ -5063,7 +5060,6 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
 	data = &req->io->timeout;
 	data->req = req;
-	req->flags |= REQ_F_TIMEOUT;
 
 	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
 		return -EFAULT;

From 8eb7e2d00763367f345ef0b2a2eb4f8001ae40ce Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 29 Jun 2020 13:13:02 +0300
Subject: [PATCH 217/502] io_uring: kill REQ_F_TIMEOUT_NOSEQ

There are too many useless flags, kill REQ_F_TIMEOUT_NOSEQ, which can be
easily inferred from req.timeout itself.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index e9c8f52daf8f..8495c17b53d6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -532,7 +532,6 @@ enum {
 	REQ_F_NOWAIT_BIT,
 	REQ_F_LINK_TIMEOUT_BIT,
 	REQ_F_ISREG_BIT,
-	REQ_F_TIMEOUT_NOSEQ_BIT,
 	REQ_F_COMP_LOCKED_BIT,
 	REQ_F_NEED_CLEANUP_BIT,
 	REQ_F_OVERFLOW_BIT,
@@ -575,8 +574,6 @@ enum {
 	REQ_F_LINK_TIMEOUT	= BIT(REQ_F_LINK_TIMEOUT_BIT),
 	/* regular file */
 	REQ_F_ISREG		= BIT(REQ_F_ISREG_BIT),
-	/* no timeout sequence */
-	REQ_F_TIMEOUT_NOSEQ	= BIT(REQ_F_TIMEOUT_NOSEQ_BIT),
 	/* completion under lock */
 	REQ_F_COMP_LOCKED	= BIT(REQ_F_COMP_LOCKED_BIT),
 	/* needs cleanup */
@@ -1010,6 +1007,11 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
 	complete(&ctx->ref_comp);
 }
 
+static inline bool io_is_timeout_noseq(struct io_kiocb *req)
+{
+	return !req->timeout.off;
+}
+
 static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 {
 	struct io_ring_ctx *ctx;
@@ -1222,7 +1224,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
 		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
 							struct io_kiocb, list);
 
-		if (req->flags & REQ_F_TIMEOUT_NOSEQ)
+		if (io_is_timeout_noseq(req))
 			break;
 		if (req->timeout.target_seq != ctx->cached_cq_tail
 					- atomic_read(&ctx->cq_timeouts))
@@ -5087,8 +5089,7 @@ static int io_timeout(struct io_kiocb *req)
 	 * timeout event to be satisfied. If it isn't set, then this is
 	 * a pure timeout request, sequence isn't used.
 	 */
-	if (!off) {
-		req->flags |= REQ_F_TIMEOUT_NOSEQ;
+	if (io_is_timeout_noseq(req)) {
 		entry = ctx->timeout_list.prev;
 		goto add;
 	}
@@ -5103,7 +5104,7 @@ static int io_timeout(struct io_kiocb *req)
 	list_for_each_prev(entry, &ctx->timeout_list) {
 		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
 
-		if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
+		if (io_is_timeout_noseq(nxt))
 			continue;
 		/* nxt.seq is behind @tail, otherwise would've been completed */
 		if (off >= nxt->timeout.target_seq - tail)

From ecfc51777487da4da530710e0b13de4c8cb4a6d2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 29 Jun 2020 13:13:03 +0300
Subject: [PATCH 218/502] io_uring: fix potential use after free on fallback
 request free

After __io_free_req() puts a ctx ref, it should be assumed that the ctx
may already be gone. However, it can be accessed when putting the
fallback req. Free the req first and then put the ctx.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8495c17b53d6..b54e358e6b31 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1526,12 +1526,15 @@ static void io_dismantle_req(struct io_kiocb *req)
 
 static void __io_free_req(struct io_kiocb *req)
 {
+	struct io_ring_ctx *ctx;
+
 	io_dismantle_req(req);
-	percpu_ref_put(&req->ctx->refs);
+	ctx = req->ctx;
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
 	else
-		clear_bit_unlock(0, (unsigned long *) &req->ctx->fallback_req);
+		clear_bit_unlock(0, (unsigned long *) &ctx->fallback_req);
+	percpu_ref_put(&ctx->refs);
 }
 
 static bool io_link_cancel_timeout(struct io_kiocb *req)

From 351fd53595a3ceb88756a005e3b864f7c8cb86e4 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 29 Jun 2020 19:18:40 +0300
Subject: [PATCH 219/502] io_uring: don't pass def into io_req_work_grab_env

Remove struct io_op_def *def parameter from io_req_work_grab_env(),
it's trivially deducible from req->opcode and fast. The API is
cleaner this way, and it also helps the compiler to understand
that it's a real constant and could be register-cached.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index b54e358e6b31..2b7666e81c13 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1101,9 +1101,10 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 	}
 }
 
-static inline void io_req_work_grab_env(struct io_kiocb *req,
-					const struct io_op_def *def)
+static inline void io_req_work_grab_env(struct io_kiocb *req)
 {
+	const struct io_op_def *def = &io_op_defs[req->opcode];
+
 	if (!req->work.mm && def->needs_mm) {
 		mmgrab(current->mm);
 		req->work.mm = current->mm;
@@ -1161,7 +1162,7 @@ static inline void io_prep_async_work(struct io_kiocb *req,
 	}
 
 	io_req_init_async(req);
-	io_req_work_grab_env(req, def);
+	io_req_work_grab_env(req);
 
 	*link = io_prep_linked_timeout(req);
 }
@@ -5255,7 +5256,7 @@ static int io_req_defer_prep(struct io_kiocb *req,
 
 	if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) {
 		io_req_init_async(req);
-		io_req_work_grab_env(req, &io_op_defs[req->opcode]);
+		io_req_work_grab_env(req);
 	}
 
 	switch (req->opcode) {

From edcdfcc149a8d0c11d4dd2b23b5338af22e31a5f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 29 Jun 2020 19:18:41 +0300
Subject: [PATCH 220/502] io_uring: do init work in grab_env()

Place io_req_init_async() in io_req_work_grab_env() so it won't be
forgotten.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2b7666e81c13..3b2f6fd8f58f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1105,6 +1105,8 @@ static inline void io_req_work_grab_env(struct io_kiocb *req)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
 
+	io_req_init_async(req);
+
 	if (!req->work.mm && def->needs_mm) {
 		mmgrab(current->mm);
 		req->work.mm = current->mm;
@@ -1161,9 +1163,7 @@ static inline void io_prep_async_work(struct io_kiocb *req,
 			req->work.flags |= IO_WQ_WORK_UNBOUND;
 	}
 
-	io_req_init_async(req);
 	io_req_work_grab_env(req);
-
 	*link = io_prep_linked_timeout(req);
 }
 
@@ -5254,10 +5254,8 @@ static int io_req_defer_prep(struct io_kiocb *req,
 			return ret;
 	}
 
-	if (for_async || (req->flags & REQ_F_WORK_INITIALIZED)) {
-		io_req_init_async(req);
+	if (for_async || (req->flags & REQ_F_WORK_INITIALIZED))
 		io_req_work_grab_env(req);
-	}
 
 	switch (req->opcode) {
 	case IORING_OP_NOP:

From debb85f496c9cc70663eac31d3ad9153839c844c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 29 Jun 2020 19:18:42 +0300
Subject: [PATCH 221/502] io_uring: factor out grab_env() from defer_prep()

Remove io_req_work_grab_env() call from io_req_defer_prep(), just call
it when necessary.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3b2f6fd8f58f..caf908382cdb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5240,7 +5240,7 @@ static int io_files_update(struct io_kiocb *req, bool force_nonblock,
 }
 
 static int io_req_defer_prep(struct io_kiocb *req,
-			     const struct io_uring_sqe *sqe, bool for_async)
+			     const struct io_uring_sqe *sqe)
 {
 	ssize_t ret = 0;
 
@@ -5254,9 +5254,6 @@ static int io_req_defer_prep(struct io_kiocb *req,
 			return ret;
 	}
 
-	if (for_async || (req->flags & REQ_F_WORK_INITIALIZED))
-		io_req_work_grab_env(req);
-
 	switch (req->opcode) {
 	case IORING_OP_NOP:
 		break;
@@ -5369,9 +5366,10 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (!req->io) {
 		if (io_alloc_async_ctx(req))
 			return -EAGAIN;
-		ret = io_req_defer_prep(req, sqe, true);
+		ret = io_req_defer_prep(req, sqe);
 		if (ret < 0)
 			return ret;
+		io_req_work_grab_env(req);
 	}
 
 	spin_lock_irq(&ctx->completion_lock);
@@ -5983,9 +5981,10 @@ fail_req:
 			ret = -EAGAIN;
 			if (io_alloc_async_ctx(req))
 				goto fail_req;
-			ret = io_req_defer_prep(req, sqe, true);
+			ret = io_req_defer_prep(req, sqe);
 			if (unlikely(ret < 0))
 				goto fail_req;
+			io_req_work_grab_env(req);
 		}
 
 		/*
@@ -6039,7 +6038,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 		if (io_alloc_async_ctx(req))
 			return -EAGAIN;
 
-		ret = io_req_defer_prep(req, sqe, false);
+		ret = io_req_defer_prep(req, sqe);
 		if (ret) {
 			/* fail even hard links since we don't submit */
 			head->flags |= REQ_F_FAIL_LINK;
@@ -6066,7 +6065,7 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			if (io_alloc_async_ctx(req))
 				return -EAGAIN;
 
-			ret = io_req_defer_prep(req, sqe, false);
+			ret = io_req_defer_prep(req, sqe);
 			if (ret)
 				req->flags |= REQ_F_FAIL_LINK;
 			*link = req;

From cbdcb4357c000861b77369c34e110fa893d23607 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 29 Jun 2020 19:18:43 +0300
Subject: [PATCH 222/502] io_uring: do grab_env() just before punting

Currently io_steal_work() is disabled, and every linked request should
go through task_work for initialisation. Do io_req_work_grab_env()
just before io-wq punting and for the whole link, so any request
reachable by io_steal_work() is prepared.

This is also interesting for another reason -- it localises
io_req_work_grab_env() into one place just before io-wq punting, helping
to to better manage req->work lifetime and add some neat
cleanup/optimisations later.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 53 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 24 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index caf908382cdb..9bc4339057ef 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1101,7 +1101,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 	}
 }
 
-static inline void io_req_work_grab_env(struct io_kiocb *req)
+static void io_req_work_grab_env(struct io_kiocb *req)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
 
@@ -1150,8 +1150,7 @@ static inline void io_req_work_drop_env(struct io_kiocb *req)
 	}
 }
 
-static inline void io_prep_async_work(struct io_kiocb *req,
-				      struct io_kiocb **link)
+static void io_prep_async_work(struct io_kiocb *req)
 {
 	const struct io_op_def *def = &io_op_defs[req->opcode];
 
@@ -1164,15 +1163,22 @@ static inline void io_prep_async_work(struct io_kiocb *req,
 	}
 
 	io_req_work_grab_env(req);
-	*link = io_prep_linked_timeout(req);
 }
 
-static inline void io_queue_async_work(struct io_kiocb *req)
+static void io_prep_async_link(struct io_kiocb *req)
+{
+	struct io_kiocb *cur;
+
+	io_prep_async_work(req);
+	if (req->flags & REQ_F_LINK_HEAD)
+		list_for_each_entry(cur, &req->link_list, link_list)
+			io_prep_async_work(cur);
+}
+
+static void __io_queue_async_work(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct io_kiocb *link;
-
-	io_prep_async_work(req, &link);
+	struct io_kiocb *link = io_prep_linked_timeout(req);
 
 	trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req,
 					&req->work, req->flags);
@@ -1182,6 +1188,13 @@ static inline void io_queue_async_work(struct io_kiocb *req)
 		io_queue_linked_timeout(link);
 }
 
+static void io_queue_async_work(struct io_kiocb *req)
+{
+	/* init ->work of the whole link before punting */
+	io_prep_async_link(req);
+	__io_queue_async_work(req);
+}
+
 static void io_kill_timeout(struct io_kiocb *req)
 {
 	int ret;
@@ -1215,7 +1228,8 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
 		if (req_need_defer(req))
 			break;
 		list_del_init(&req->list);
-		io_queue_async_work(req);
+		/* punt-init is done before queueing for defer */
+		__io_queue_async_work(req);
 	} while (!list_empty(&ctx->defer_list));
 }
 
@@ -1791,7 +1805,7 @@ static void io_put_req(struct io_kiocb *req)
 
 static struct io_wq_work *io_steal_work(struct io_kiocb *req)
 {
-	struct io_kiocb *nxt = NULL;
+	struct io_kiocb *timeout, *nxt = NULL;
 
 	/*
 	 * A ref is owned by io-wq in which context we're. So, if that's the
@@ -1805,18 +1819,10 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req)
 	if (!nxt)
 		return NULL;
 
-	if ((nxt->flags & REQ_F_ISREG) && io_op_defs[nxt->opcode].hash_reg_file)
-		io_wq_hash_work(&nxt->work, file_inode(nxt->file));
-
-	io_req_task_queue(nxt);
-	/*
-	 * If we're going to return actual work, here should be timeout prep:
-	 *
-	 * link = io_prep_linked_timeout(nxt);
-	 * if (link)
-	 *	nxt->flags |= REQ_F_QUEUE_TIMEOUT;
-	 */
-	return NULL;
+	timeout = io_prep_linked_timeout(nxt);
+	if (timeout)
+		nxt->flags |= REQ_F_QUEUE_TIMEOUT;
+	return &nxt->work;
 }
 
 /*
@@ -5369,8 +5375,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		ret = io_req_defer_prep(req, sqe);
 		if (ret < 0)
 			return ret;
-		io_req_work_grab_env(req);
 	}
+	io_prep_async_link(req);
 
 	spin_lock_irq(&ctx->completion_lock);
 	if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
@@ -5984,7 +5990,6 @@ fail_req:
 			ret = io_req_defer_prep(req, sqe);
 			if (unlikely(ret < 0))
 				goto fail_req;
-			io_req_work_grab_env(req);
 		}
 
 		/*

From ab0b6451db2a8ed630b89ef3826b8ea994149444 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Tue, 30 Jun 2020 08:43:15 -0600
Subject: [PATCH 223/502] io_uring: clean up io_kill_linked_timeout() locking

Avoid jumping through hoops to silence unused variable warnings, and
also fix sparse rightfully complaining about the locking context:

fs/io_uring.c:1593:39: warning: context imbalance in 'io_kill_linked_timeout' - unexpected unlock

Provide the functional helper as __io_kill_linked_timeout(), and
separate the locking from it.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9bc4339057ef..3c12221f549e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1569,28 +1569,38 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
 	return false;
 }
 
-static void io_kill_linked_timeout(struct io_kiocb *req)
+static bool __io_kill_linked_timeout(struct io_kiocb *req)
 {
-	struct io_ring_ctx *ctx = req->ctx;
 	struct io_kiocb *link;
-	bool wake_ev = false;
-	unsigned long flags = 0; /* false positive warning */
-
-	if (!(req->flags & REQ_F_COMP_LOCKED))
-		spin_lock_irqsave(&ctx->completion_lock, flags);
+	bool wake_ev;
 
 	if (list_empty(&req->link_list))
-		goto out;
+		return false;
 	link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
 	if (link->opcode != IORING_OP_LINK_TIMEOUT)
-		goto out;
+		return false;
 
 	list_del_init(&link->link_list);
 	wake_ev = io_link_cancel_timeout(link);
 	req->flags &= ~REQ_F_LINK_TIMEOUT;
-out:
-	if (!(req->flags & REQ_F_COMP_LOCKED))
+	return wake_ev;
+}
+
+static void io_kill_linked_timeout(struct io_kiocb *req)
+{
+	struct io_ring_ctx *ctx = req->ctx;
+	bool wake_ev;
+
+	if (!(req->flags & REQ_F_COMP_LOCKED)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		wake_ev = __io_kill_linked_timeout(req);
 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	} else {
+		wake_ev = __io_kill_linked_timeout(req);
+	}
+
 	if (wake_ev)
 		io_cqring_ev_posted(ctx);
 }

From cf2f54255d0342cfbd273cbb964ad6bc7674f587 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 30 Jun 2020 15:20:40 +0300
Subject: [PATCH 224/502] io_uring: don't fail iopoll requeue without ->mm

Actually, io_iopoll_queue() may have a NULL ->mm, that is if the SQ thread
didn't grab mm before doing iopoll. Don't fail reqs there, as after
recent changes it won't be punted directly but rather through task_work.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3c12221f549e..43419f5bef8c 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1902,9 +1902,7 @@ static void io_iopoll_queue(struct list_head *again)
 	do {
 		req = list_first_entry(again, struct io_kiocb, list);
 		list_del(&req->list);
-
-		/* should have ->mm unless io_uring is dying, kill reqs then */
-		if (unlikely(!current->mm) || !io_rw_reissue(req, -EAGAIN))
+		if (!io_rw_reissue(req, -EAGAIN))
 			io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL);
 	} while (!list_empty(again));
 }

From ea1164e574e9af0a15ab730ead0861a4c7724142 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 30 Jun 2020 15:20:41 +0300
Subject: [PATCH 225/502] io_uring: fix NULL mm in io_poll_task_func()

io_poll_task_func() hand-coded link submission forgetting to set
TASK_RUNNING, acquire mm, etc. Call existing helper for that,
i.e. __io_req_task_submit().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 43419f5bef8c..2c17c2613205 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4518,13 +4518,8 @@ static void io_poll_task_func(struct callback_head *cb)
 	struct io_kiocb *nxt = NULL;
 
 	io_poll_task_handler(req, &nxt);
-	if (nxt) {
-		struct io_ring_ctx *ctx = nxt->ctx;
-
-		mutex_lock(&ctx->uring_lock);
-		__io_queue_sqe(nxt, NULL, NULL);
-		mutex_unlock(&ctx->uring_lock);
-	}
+	if (nxt)
+		__io_req_task_submit(nxt);
 }
 
 static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode,

From 0be0b0e33b0bfd08264b108512e44b3907fe987b Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 30 Jun 2020 15:20:42 +0300
Subject: [PATCH 226/502] io_uring: simplify io_async_task_func()

Greatly simplify io_async_task_func() removing duplicated functionality
of __io_req_task_submit(). This does one extra spin lock/unlock for the
cancelled poll case, but that shouldn't happen often.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 29 +++++------------------------
 1 file changed, 5 insertions(+), 24 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2c17c2613205..82b35948ac5b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4608,7 +4608,6 @@ static void io_async_task_func(struct callback_head *cb)
 	struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work);
 	struct async_poll *apoll = req->apoll;
 	struct io_ring_ctx *ctx = req->ctx;
-	bool canceled = false;
 
 	trace_io_uring_task_run(req->ctx, req->opcode, req->user_data);
 
@@ -4618,15 +4617,8 @@ static void io_async_task_func(struct callback_head *cb)
 	}
 
 	/* If req is still hashed, it cannot have been canceled. Don't check. */
-	if (hash_hashed(&req->hash_node)) {
+	if (hash_hashed(&req->hash_node))
 		hash_del(&req->hash_node);
-	} else {
-		canceled = READ_ONCE(apoll->poll.canceled);
-		if (canceled) {
-			io_cqring_fill_event(req, -ECANCELED);
-			io_commit_cqring(ctx);
-		}
-	}
 
 	spin_unlock_irq(&ctx->completion_lock);
 
@@ -4635,21 +4627,10 @@ static void io_async_task_func(struct callback_head *cb)
 		memcpy(&req->work, &apoll->work, sizeof(req->work));
 	kfree(apoll);
 
-	if (!canceled) {
-		__set_current_state(TASK_RUNNING);
-		if (io_sq_thread_acquire_mm(ctx, req)) {
-			io_cqring_add_event(req, -EFAULT, 0);
-			goto end_req;
-		}
-		mutex_lock(&ctx->uring_lock);
-		__io_queue_sqe(req, NULL, NULL);
-		mutex_unlock(&ctx->uring_lock);
-	} else {
-		io_cqring_ev_posted(ctx);
-end_req:
-		req_set_fail_links(req);
-		io_double_put_req(req);
-	}
+	if (!READ_ONCE(apoll->poll.canceled))
+		__io_req_task_submit(req);
+	else
+		__io_req_task_cancel(req, -ECANCELED);
 }
 
 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,

From 3fa5e0f331280237af918ab2e7a160f5a68d3e7d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 30 Jun 2020 15:20:43 +0300
Subject: [PATCH 227/502] io_uring: optimise io_req_find_next() fast check

gcc 9.2.0 compiles io_req_find_next() as a separate function leaving
the first REQ_F_LINK_HEAD fast check not inlined. Help it by splitting
out the check from the function.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 82b35948ac5b..9a43847c6823 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1664,12 +1664,9 @@ static void io_fail_links(struct io_kiocb *req)
 	io_cqring_ev_posted(ctx);
 }
 
-static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+static struct io_kiocb *__io_req_find_next(struct io_kiocb *req)
 {
-	if (likely(!(req->flags & REQ_F_LINK_HEAD)))
-		return NULL;
 	req->flags &= ~REQ_F_LINK_HEAD;
-
 	if (req->flags & REQ_F_LINK_TIMEOUT)
 		io_kill_linked_timeout(req);
 
@@ -1685,6 +1682,13 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 	return NULL;
 }
 
+static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
+{
+	if (likely(!(req->flags & REQ_F_LINK_HEAD)))
+		return NULL;
+	return __io_req_find_next(req);
+}
+
 static void __io_req_task_cancel(struct io_kiocb *req, int error)
 {
 	struct io_ring_ctx *ctx = req->ctx;

From 8eb06d7e8dd853d70668617dda57de4f6cebe651 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 30 Jun 2020 15:20:39 +0300
Subject: [PATCH 228/502] io_uring: fix missing ->mm on exit

There is a fancy bug, where an exiting user task may not have ->mm,
which makes task_works try to do kthread_use_mm(ctx->sqo_mm).

Don't do that if sqo_mm is NULL.

[  290.460558] WARNING: CPU: 6 PID: 150933 at kernel/kthread.c:1238
	kthread_use_mm+0xf3/0x110
[  290.460579] CPU: 6 PID: 150933 Comm: read-write2 Tainted: G
	I E     5.8.0-rc2-00066-g9b21720607cf #531
[  290.460580] RIP: 0010:kthread_use_mm+0xf3/0x110
...
[  290.460584] Call Trace:
[  290.460584]  __io_sq_thread_acquire_mm.isra.0.part.0+0x25/0x30
[  290.460584]  __io_req_task_submit+0x64/0x80
[  290.460584]  io_req_task_submit+0x15/0x20
[  290.460585]  task_work_run+0x67/0xa0
[  290.460585]  do_exit+0x35d/0xb70
[  290.460585]  do_group_exit+0x43/0xa0
[  290.460585]  get_signal+0x140/0x900
[  290.460586]  do_signal+0x37/0x780
[  290.460586]  __prepare_exit_to_usermode+0x126/0x1c0
[  290.460586]  __syscall_return_slowpath+0x3b/0x1c0
[  290.460587]  do_syscall_64+0x5f/0xa0
[  290.460587]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

followed by faults.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9a43847c6823..cfad2acd4d86 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -958,7 +958,7 @@ static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
 {
 	if (!current->mm) {
-		if (unlikely(!mmget_not_zero(ctx->sqo_mm)))
+		if (unlikely(!ctx->sqo_mm || !mmget_not_zero(ctx->sqo_mm)))
 			return -EFAULT;
 		kthread_use_mm(ctx->sqo_mm);
 	}
@@ -7216,10 +7216,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 {
 	int ret;
 
-	mmgrab(current->mm);
-	ctx->sqo_mm = current->mm;
-
 	if (ctx->flags & IORING_SETUP_SQPOLL) {
+		mmgrab(current->mm);
+		ctx->sqo_mm = current->mm;
+
 		ret = -EPERM;
 		if (!capable(CAP_SYS_ADMIN))
 			goto err;
@@ -7263,8 +7263,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 	return 0;
 err:
 	io_finish_async(ctx);
-	mmdrop(ctx->sqo_mm);
-	ctx->sqo_mm = NULL;
+	if (ctx->sqo_mm) {
+		mmdrop(ctx->sqo_mm);
+		ctx->sqo_mm = NULL;
+	}
 	return ret;
 }
 

From fb37409a01b011a664347702f44dbf13fa7c7486 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 10 Jun 2020 09:45:20 +0300
Subject: [PATCH 229/502] arch: remove unicore32 port

The unicore32 port does not seem to have been maintained for a long time
now: there is no upstream toolchain that can create unicore32 binaries and
all the links to prebuilt toolchains for unicore32 are dead. Even compilers
that were available are not supported by the kernel anymore.

Guenter Roeck says:

  I have stopped building unicore32 images since v4.19 since there is no
  available compiler that is still supported by the kernel. I am surprised
  that support for it has not been removed from the kernel.

Remove unicore32 port.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 .../features/core/cBPF-JIT/arch-support.txt   |   1 -
 .../features/core/eBPF-JIT/arch-support.txt   |   1 -
 .../core/generic-idle-thread/arch-support.txt |   1 -
 .../core/jump-labels/arch-support.txt         |   1 -
 .../features/core/tracehook/arch-support.txt  |   1 -
 .../features/debug/KASAN/arch-support.txt     |   1 -
 .../debug/debug-vm-pgtable/arch-support.txt   |   1 -
 .../debug/gcov-profile-all/arch-support.txt   |   1 -
 .../features/debug/kgdb/arch-support.txt      |   1 -
 .../debug/kprobes-on-ftrace/arch-support.txt  |   1 -
 .../features/debug/kprobes/arch-support.txt   |   1 -
 .../debug/kretprobes/arch-support.txt         |   1 -
 .../features/debug/optprobes/arch-support.txt |   1 -
 .../debug/stackprotector/arch-support.txt     |   1 -
 .../features/debug/uprobes/arch-support.txt   |   1 -
 .../debug/user-ret-profiler/arch-support.txt  |   1 -
 .../io/dma-contiguous/arch-support.txt        |   1 -
 .../locking/cmpxchg-local/arch-support.txt    |   1 -
 .../features/locking/lockdep/arch-support.txt |   1 -
 .../locking/queued-rwlocks/arch-support.txt   |   1 -
 .../locking/queued-spinlocks/arch-support.txt |   1 -
 .../perf/kprobes-event/arch-support.txt       |   1 -
 .../features/perf/perf-regs/arch-support.txt  |   1 -
 .../perf/perf-stackdump/arch-support.txt      |   1 -
 .../membarrier-sync-core/arch-support.txt     |   1 -
 .../sched/numa-balancing/arch-support.txt     |   1 -
 .../seccomp/seccomp-filter/arch-support.txt   |   1 -
 .../time/arch-tick-broadcast/arch-support.txt |   1 -
 .../time/clockevents/arch-support.txt         |   1 -
 .../time/context-tracking/arch-support.txt    |   1 -
 .../time/irq-time-acct/arch-support.txt       |   1 -
 .../time/modern-timekeeping/arch-support.txt  |   1 -
 .../time/virt-cpuacct/arch-support.txt        |   1 -
 .../features/vm/ELF-ASLR/arch-support.txt     |   1 -
 .../features/vm/PG_uncached/arch-support.txt  |   1 -
 .../features/vm/THP/arch-support.txt          |   1 -
 .../features/vm/TLB/arch-support.txt          |   1 -
 .../features/vm/huge-vmap/arch-support.txt    |   1 -
 .../features/vm/ioremap_prot/arch-support.txt |   1 -
 .../features/vm/pte_special/arch-support.txt  |   1 -
 MAINTAINERS                                   |   7 -
 arch/unicore32/.gitignore                     |  22 -
 arch/unicore32/Kconfig                        | 200 -----
 arch/unicore32/Kconfig.debug                  |  29 -
 arch/unicore32/Makefile                       |  59 --
 arch/unicore32/boot/Makefile                  |  39 -
 arch/unicore32/boot/compressed/Makefile       |  64 --
 arch/unicore32/boot/compressed/head.S         | 201 -----
 arch/unicore32/boot/compressed/misc.c         | 123 ---
 arch/unicore32/boot/compressed/piggy.S.in     |   6 -
 arch/unicore32/boot/compressed/vmlinux.lds.S  |  58 --
 arch/unicore32/configs/defconfig              | 214 -----
 arch/unicore32/include/asm/Kbuild             |   7 -
 arch/unicore32/include/asm/assembler.h        | 128 ---
 arch/unicore32/include/asm/barrier.h          |  16 -
 arch/unicore32/include/asm/bitops.h           |  46 -
 arch/unicore32/include/asm/bug.h              |  20 -
 arch/unicore32/include/asm/cache.h            |  24 -
 arch/unicore32/include/asm/cacheflush.h       | 186 ----
 arch/unicore32/include/asm/checksum.h         |  38 -
 arch/unicore32/include/asm/cmpxchg.h          |  58 --
 arch/unicore32/include/asm/cpu-single.h       |  42 -
 arch/unicore32/include/asm/cputype.h          |  30 -
 arch/unicore32/include/asm/delay.h            |  49 --
 arch/unicore32/include/asm/dma.h              |  20 -
 arch/unicore32/include/asm/elf.h              |  90 --
 arch/unicore32/include/asm/fpstate.h          |  23 -
 arch/unicore32/include/asm/fpu-ucf64.h        |  50 --
 arch/unicore32/include/asm/gpio.h             | 101 ---
 arch/unicore32/include/asm/hwcap.h            |  29 -
 arch/unicore32/include/asm/hwdef-copro.h      |  45 -
 arch/unicore32/include/asm/io.h               |  69 --
 arch/unicore32/include/asm/irq.h              | 102 ---
 arch/unicore32/include/asm/irqflags.h         |  50 --
 arch/unicore32/include/asm/linkage.h          |  19 -
 arch/unicore32/include/asm/memblock.h         |  43 -
 arch/unicore32/include/asm/memory.h           | 102 ---
 arch/unicore32/include/asm/mmu.h              |  14 -
 arch/unicore32/include/asm/mmu_context.h      |  98 ---
 arch/unicore32/include/asm/page.h             |  74 --
 arch/unicore32/include/asm/pci.h              |  20 -
 arch/unicore32/include/asm/pgalloc.h          |  87 --
 arch/unicore32/include/asm/pgtable-hwdef.h    |  51 --
 arch/unicore32/include/asm/pgtable.h          | 267 ------
 arch/unicore32/include/asm/processor.h        |  74 --
 arch/unicore32/include/asm/ptrace.h           |  58 --
 arch/unicore32/include/asm/stacktrace.h       |  28 -
 arch/unicore32/include/asm/string.h           |  35 -
 arch/unicore32/include/asm/suspend.h          |  26 -
 arch/unicore32/include/asm/switch_to.h        |  27 -
 arch/unicore32/include/asm/syscall.h          |  12 -
 arch/unicore32/include/asm/thread_info.h      | 133 ---
 arch/unicore32/include/asm/timex.h            |  31 -
 arch/unicore32/include/asm/tlb.h              |  24 -
 arch/unicore32/include/asm/tlbflush.h         | 192 -----
 arch/unicore32/include/asm/traps.h            |  18 -
 arch/unicore32/include/asm/uaccess.h          |  38 -
 arch/unicore32/include/asm/vmalloc.h          |   4 -
 arch/unicore32/include/mach/PKUnity.h         |  95 ---
 arch/unicore32/include/mach/bitfield.h        |  21 -
 arch/unicore32/include/mach/dma.h             |  45 -
 arch/unicore32/include/mach/hardware.h        |  30 -
 arch/unicore32/include/mach/map.h             |  17 -
 arch/unicore32/include/mach/memory.h          |  54 --
 arch/unicore32/include/mach/ocd.h             |  33 -
 arch/unicore32/include/mach/pm.h              |  37 -
 arch/unicore32/include/mach/regs-ac97.h       |  33 -
 arch/unicore32/include/mach/regs-dmac.h       |  82 --
 arch/unicore32/include/mach/regs-gpio.h       |  71 --
 arch/unicore32/include/mach/regs-i2c.h        |  64 --
 arch/unicore32/include/mach/regs-intc.h       |  29 -
 arch/unicore32/include/mach/regs-nand.h       |  80 --
 arch/unicore32/include/mach/regs-ost.h        |  91 --
 arch/unicore32/include/mach/regs-pci.h        |  95 ---
 arch/unicore32/include/mach/regs-pm.h         | 127 ---
 arch/unicore32/include/mach/regs-ps2.h        |  21 -
 arch/unicore32/include/mach/regs-resetc.h     |  35 -
 arch/unicore32/include/mach/regs-rtc.h        |  38 -
 arch/unicore32/include/mach/regs-sdc.h        | 157 ----
 arch/unicore32/include/mach/regs-spi.h        |  99 ---
 arch/unicore32/include/mach/regs-uart.h       |   3 -
 arch/unicore32/include/mach/regs-umal.h       | 230 -----
 arch/unicore32/include/mach/regs-unigfx.h     | 201 -----
 arch/unicore32/include/mach/uncompress.h      |  31 -
 arch/unicore32/include/uapi/asm/Kbuild        |   2 -
 arch/unicore32/include/uapi/asm/byteorder.h   |  25 -
 arch/unicore32/include/uapi/asm/ptrace.h      |  91 --
 arch/unicore32/include/uapi/asm/sigcontext.h  |  30 -
 arch/unicore32/include/uapi/asm/unistd.h      |  21 -
 arch/unicore32/kernel/Makefile                |  31 -
 arch/unicore32/kernel/asm-offsets.c           | 108 ---
 arch/unicore32/kernel/clock.c                 | 387 ---------
 arch/unicore32/kernel/debug-macro.S           |  86 --
 arch/unicore32/kernel/debug.S                 |  82 --
 arch/unicore32/kernel/dma.c                   | 179 ----
 arch/unicore32/kernel/early_printk.c          |  46 -
 arch/unicore32/kernel/elf.c                   |  35 -
 arch/unicore32/kernel/entry.S                 | 802 ------------------
 arch/unicore32/kernel/fpu-ucf64.c             | 117 ---
 arch/unicore32/kernel/gpio.c                  | 121 ---
 arch/unicore32/kernel/head.S                  | 249 ------
 arch/unicore32/kernel/hibernate.c             | 159 ----
 arch/unicore32/kernel/hibernate_asm.S         | 114 ---
 arch/unicore32/kernel/irq.c                   | 371 --------
 arch/unicore32/kernel/ksyms.c                 |  57 --
 arch/unicore32/kernel/ksyms.h                 |  14 -
 arch/unicore32/kernel/module.c                | 105 ---
 arch/unicore32/kernel/pci.c                   | 371 --------
 arch/unicore32/kernel/pm.c                    | 121 ---
 arch/unicore32/kernel/process.c               | 319 -------
 arch/unicore32/kernel/ptrace.c                | 147 ----
 arch/unicore32/kernel/puv3-core.c             | 276 ------
 arch/unicore32/kernel/puv3-nb0916.c           | 147 ----
 arch/unicore32/kernel/setup.c                 | 352 --------
 arch/unicore32/kernel/setup.h                 |  36 -
 arch/unicore32/kernel/signal.c                | 424 ---------
 arch/unicore32/kernel/sleep.S                 | 199 -----
 arch/unicore32/kernel/stacktrace.c            | 127 ---
 arch/unicore32/kernel/sys.c                   |  37 -
 arch/unicore32/kernel/time.c                  | 128 ---
 arch/unicore32/kernel/traps.c                 | 322 -------
 arch/unicore32/kernel/vmlinux.lds.S           |  59 --
 arch/unicore32/lib/Makefile                   |  28 -
 arch/unicore32/lib/backtrace.S                | 168 ----
 arch/unicore32/lib/clear_user.S               |  54 --
 arch/unicore32/lib/copy_from_user.S           | 101 ---
 arch/unicore32/lib/copy_page.S                |  36 -
 arch/unicore32/lib/copy_template.S            | 211 -----
 arch/unicore32/lib/copy_to_user.S             |  93 --
 arch/unicore32/lib/delay.S                    |  48 --
 arch/unicore32/lib/findbit.S                  |  97 ---
 arch/unicore32/lib/strncpy_from_user.S        |  42 -
 arch/unicore32/lib/strnlen_user.S             |  39 -
 arch/unicore32/mm/Kconfig                     |  41 -
 arch/unicore32/mm/Makefile                    |  14 -
 arch/unicore32/mm/alignment.c                 | 524 ------------
 arch/unicore32/mm/cache-ucv2.S                | 209 -----
 arch/unicore32/mm/extable.c                   |  21 -
 arch/unicore32/mm/fault.c                     | 481 -----------
 arch/unicore32/mm/flush.c                     |  94 --
 arch/unicore32/mm/init.c                      | 261 ------
 arch/unicore32/mm/ioremap.c                   | 242 ------
 arch/unicore32/mm/mm.h                        |  31 -
 arch/unicore32/mm/mmu.c                       | 513 -----------
 arch/unicore32/mm/pgd.c                       | 102 ---
 arch/unicore32/mm/proc-macros.S               | 142 ----
 arch/unicore32/mm/proc-syms.c                 |  19 -
 arch/unicore32/mm/proc-ucv2.S                 | 131 ---
 arch/unicore32/mm/tlb-ucv2.S                  |  86 --
 kernel/reboot.c                               |   2 +-
 190 files changed, 1 insertion(+), 15705 deletions(-)
 delete mode 100644 arch/unicore32/.gitignore
 delete mode 100644 arch/unicore32/Kconfig
 delete mode 100644 arch/unicore32/Kconfig.debug
 delete mode 100644 arch/unicore32/Makefile
 delete mode 100644 arch/unicore32/boot/Makefile
 delete mode 100644 arch/unicore32/boot/compressed/Makefile
 delete mode 100644 arch/unicore32/boot/compressed/head.S
 delete mode 100644 arch/unicore32/boot/compressed/misc.c
 delete mode 100644 arch/unicore32/boot/compressed/piggy.S.in
 delete mode 100644 arch/unicore32/boot/compressed/vmlinux.lds.S
 delete mode 100644 arch/unicore32/configs/defconfig
 delete mode 100644 arch/unicore32/include/asm/Kbuild
 delete mode 100644 arch/unicore32/include/asm/assembler.h
 delete mode 100644 arch/unicore32/include/asm/barrier.h
 delete mode 100644 arch/unicore32/include/asm/bitops.h
 delete mode 100644 arch/unicore32/include/asm/bug.h
 delete mode 100644 arch/unicore32/include/asm/cache.h
 delete mode 100644 arch/unicore32/include/asm/cacheflush.h
 delete mode 100644 arch/unicore32/include/asm/checksum.h
 delete mode 100644 arch/unicore32/include/asm/cmpxchg.h
 delete mode 100644 arch/unicore32/include/asm/cpu-single.h
 delete mode 100644 arch/unicore32/include/asm/cputype.h
 delete mode 100644 arch/unicore32/include/asm/delay.h
 delete mode 100644 arch/unicore32/include/asm/dma.h
 delete mode 100644 arch/unicore32/include/asm/elf.h
 delete mode 100644 arch/unicore32/include/asm/fpstate.h
 delete mode 100644 arch/unicore32/include/asm/fpu-ucf64.h
 delete mode 100644 arch/unicore32/include/asm/gpio.h
 delete mode 100644 arch/unicore32/include/asm/hwcap.h
 delete mode 100644 arch/unicore32/include/asm/hwdef-copro.h
 delete mode 100644 arch/unicore32/include/asm/io.h
 delete mode 100644 arch/unicore32/include/asm/irq.h
 delete mode 100644 arch/unicore32/include/asm/irqflags.h
 delete mode 100644 arch/unicore32/include/asm/linkage.h
 delete mode 100644 arch/unicore32/include/asm/memblock.h
 delete mode 100644 arch/unicore32/include/asm/memory.h
 delete mode 100644 arch/unicore32/include/asm/mmu.h
 delete mode 100644 arch/unicore32/include/asm/mmu_context.h
 delete mode 100644 arch/unicore32/include/asm/page.h
 delete mode 100644 arch/unicore32/include/asm/pci.h
 delete mode 100644 arch/unicore32/include/asm/pgalloc.h
 delete mode 100644 arch/unicore32/include/asm/pgtable-hwdef.h
 delete mode 100644 arch/unicore32/include/asm/pgtable.h
 delete mode 100644 arch/unicore32/include/asm/processor.h
 delete mode 100644 arch/unicore32/include/asm/ptrace.h
 delete mode 100644 arch/unicore32/include/asm/stacktrace.h
 delete mode 100644 arch/unicore32/include/asm/string.h
 delete mode 100644 arch/unicore32/include/asm/suspend.h
 delete mode 100644 arch/unicore32/include/asm/switch_to.h
 delete mode 100644 arch/unicore32/include/asm/syscall.h
 delete mode 100644 arch/unicore32/include/asm/thread_info.h
 delete mode 100644 arch/unicore32/include/asm/timex.h
 delete mode 100644 arch/unicore32/include/asm/tlb.h
 delete mode 100644 arch/unicore32/include/asm/tlbflush.h
 delete mode 100644 arch/unicore32/include/asm/traps.h
 delete mode 100644 arch/unicore32/include/asm/uaccess.h
 delete mode 100644 arch/unicore32/include/asm/vmalloc.h
 delete mode 100644 arch/unicore32/include/mach/PKUnity.h
 delete mode 100644 arch/unicore32/include/mach/bitfield.h
 delete mode 100644 arch/unicore32/include/mach/dma.h
 delete mode 100644 arch/unicore32/include/mach/hardware.h
 delete mode 100644 arch/unicore32/include/mach/map.h
 delete mode 100644 arch/unicore32/include/mach/memory.h
 delete mode 100644 arch/unicore32/include/mach/ocd.h
 delete mode 100644 arch/unicore32/include/mach/pm.h
 delete mode 100644 arch/unicore32/include/mach/regs-ac97.h
 delete mode 100644 arch/unicore32/include/mach/regs-dmac.h
 delete mode 100644 arch/unicore32/include/mach/regs-gpio.h
 delete mode 100644 arch/unicore32/include/mach/regs-i2c.h
 delete mode 100644 arch/unicore32/include/mach/regs-intc.h
 delete mode 100644 arch/unicore32/include/mach/regs-nand.h
 delete mode 100644 arch/unicore32/include/mach/regs-ost.h
 delete mode 100644 arch/unicore32/include/mach/regs-pci.h
 delete mode 100644 arch/unicore32/include/mach/regs-pm.h
 delete mode 100644 arch/unicore32/include/mach/regs-ps2.h
 delete mode 100644 arch/unicore32/include/mach/regs-resetc.h
 delete mode 100644 arch/unicore32/include/mach/regs-rtc.h
 delete mode 100644 arch/unicore32/include/mach/regs-sdc.h
 delete mode 100644 arch/unicore32/include/mach/regs-spi.h
 delete mode 100644 arch/unicore32/include/mach/regs-uart.h
 delete mode 100644 arch/unicore32/include/mach/regs-umal.h
 delete mode 100644 arch/unicore32/include/mach/regs-unigfx.h
 delete mode 100644 arch/unicore32/include/mach/uncompress.h
 delete mode 100644 arch/unicore32/include/uapi/asm/Kbuild
 delete mode 100644 arch/unicore32/include/uapi/asm/byteorder.h
 delete mode 100644 arch/unicore32/include/uapi/asm/ptrace.h
 delete mode 100644 arch/unicore32/include/uapi/asm/sigcontext.h
 delete mode 100644 arch/unicore32/include/uapi/asm/unistd.h
 delete mode 100644 arch/unicore32/kernel/Makefile
 delete mode 100644 arch/unicore32/kernel/asm-offsets.c
 delete mode 100644 arch/unicore32/kernel/clock.c
 delete mode 100644 arch/unicore32/kernel/debug-macro.S
 delete mode 100644 arch/unicore32/kernel/debug.S
 delete mode 100644 arch/unicore32/kernel/dma.c
 delete mode 100644 arch/unicore32/kernel/early_printk.c
 delete mode 100644 arch/unicore32/kernel/elf.c
 delete mode 100644 arch/unicore32/kernel/entry.S
 delete mode 100644 arch/unicore32/kernel/fpu-ucf64.c
 delete mode 100644 arch/unicore32/kernel/gpio.c
 delete mode 100644 arch/unicore32/kernel/head.S
 delete mode 100644 arch/unicore32/kernel/hibernate.c
 delete mode 100644 arch/unicore32/kernel/hibernate_asm.S
 delete mode 100644 arch/unicore32/kernel/irq.c
 delete mode 100644 arch/unicore32/kernel/ksyms.c
 delete mode 100644 arch/unicore32/kernel/ksyms.h
 delete mode 100644 arch/unicore32/kernel/module.c
 delete mode 100644 arch/unicore32/kernel/pci.c
 delete mode 100644 arch/unicore32/kernel/pm.c
 delete mode 100644 arch/unicore32/kernel/process.c
 delete mode 100644 arch/unicore32/kernel/ptrace.c
 delete mode 100644 arch/unicore32/kernel/puv3-core.c
 delete mode 100644 arch/unicore32/kernel/puv3-nb0916.c
 delete mode 100644 arch/unicore32/kernel/setup.c
 delete mode 100644 arch/unicore32/kernel/setup.h
 delete mode 100644 arch/unicore32/kernel/signal.c
 delete mode 100644 arch/unicore32/kernel/sleep.S
 delete mode 100644 arch/unicore32/kernel/stacktrace.c
 delete mode 100644 arch/unicore32/kernel/sys.c
 delete mode 100644 arch/unicore32/kernel/time.c
 delete mode 100644 arch/unicore32/kernel/traps.c
 delete mode 100644 arch/unicore32/kernel/vmlinux.lds.S
 delete mode 100644 arch/unicore32/lib/Makefile
 delete mode 100644 arch/unicore32/lib/backtrace.S
 delete mode 100644 arch/unicore32/lib/clear_user.S
 delete mode 100644 arch/unicore32/lib/copy_from_user.S
 delete mode 100644 arch/unicore32/lib/copy_page.S
 delete mode 100644 arch/unicore32/lib/copy_template.S
 delete mode 100644 arch/unicore32/lib/copy_to_user.S
 delete mode 100644 arch/unicore32/lib/delay.S
 delete mode 100644 arch/unicore32/lib/findbit.S
 delete mode 100644 arch/unicore32/lib/strncpy_from_user.S
 delete mode 100644 arch/unicore32/lib/strnlen_user.S
 delete mode 100644 arch/unicore32/mm/Kconfig
 delete mode 100644 arch/unicore32/mm/Makefile
 delete mode 100644 arch/unicore32/mm/alignment.c
 delete mode 100644 arch/unicore32/mm/cache-ucv2.S
 delete mode 100644 arch/unicore32/mm/extable.c
 delete mode 100644 arch/unicore32/mm/fault.c
 delete mode 100644 arch/unicore32/mm/flush.c
 delete mode 100644 arch/unicore32/mm/init.c
 delete mode 100644 arch/unicore32/mm/ioremap.c
 delete mode 100644 arch/unicore32/mm/mm.h
 delete mode 100644 arch/unicore32/mm/mmu.c
 delete mode 100644 arch/unicore32/mm/pgd.c
 delete mode 100644 arch/unicore32/mm/proc-macros.S
 delete mode 100644 arch/unicore32/mm/proc-syms.c
 delete mode 100644 arch/unicore32/mm/proc-ucv2.S
 delete mode 100644 arch/unicore32/mm/tlb-ucv2.S

diff --git a/Documentation/features/core/cBPF-JIT/arch-support.txt b/Documentation/features/core/cBPF-JIT/arch-support.txt
index 8620c38d4db0..399935616813 100644
--- a/Documentation/features/core/cBPF-JIT/arch-support.txt
+++ b/Documentation/features/core/cBPF-JIT/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: | TODO |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/core/eBPF-JIT/arch-support.txt b/Documentation/features/core/eBPF-JIT/arch-support.txt
index 9ed964f65224..79409bfe0263 100644
--- a/Documentation/features/core/eBPF-JIT/arch-support.txt
+++ b/Documentation/features/core/eBPF-JIT/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/core/generic-idle-thread/arch-support.txt b/Documentation/features/core/generic-idle-thread/arch-support.txt
index 365df2c2ff0b..9ea60e416efd 100644
--- a/Documentation/features/core/generic-idle-thread/arch-support.txt
+++ b/Documentation/features/core/generic-idle-thread/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/core/jump-labels/arch-support.txt b/Documentation/features/core/jump-labels/arch-support.txt
index 632a1c7aefa2..f8ec5c13cde4 100644
--- a/Documentation/features/core/jump-labels/arch-support.txt
+++ b/Documentation/features/core/jump-labels/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/core/tracehook/arch-support.txt b/Documentation/features/core/tracehook/arch-support.txt
index 964667052eda..cd3510e2eedb 100644
--- a/Documentation/features/core/tracehook/arch-support.txt
+++ b/Documentation/features/core/tracehook/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt
index 6ff38548923e..c3fe9b266e7b 100644
--- a/Documentation/features/debug/KASAN/arch-support.txt
+++ b/Documentation/features/debug/KASAN/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
index c527d05c0459..ca6bacb1e99e 100644
--- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
+++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt
index 210256f6a4cf..7563a494ddb8 100644
--- a/Documentation/features/debug/gcov-profile-all/arch-support.txt
+++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/kgdb/arch-support.txt b/Documentation/features/debug/kgdb/arch-support.txt
index 38c40cfa0578..4b0a1d0d6ba4 100644
--- a/Documentation/features/debug/kgdb/arch-support.txt
+++ b/Documentation/features/debug/kgdb/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
index 97cd7aa74905..6225cfe0c5bf 100644
--- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
+++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/kprobes/arch-support.txt b/Documentation/features/debug/kprobes/arch-support.txt
index 8b316c6e03d4..371f0ac488f5 100644
--- a/Documentation/features/debug/kprobes/arch-support.txt
+++ b/Documentation/features/debug/kprobes/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/kretprobes/arch-support.txt b/Documentation/features/debug/kretprobes/arch-support.txt
index b805aada395e..38e95251deed 100644
--- a/Documentation/features/debug/kretprobes/arch-support.txt
+++ b/Documentation/features/debug/kretprobes/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/optprobes/arch-support.txt b/Documentation/features/debug/optprobes/arch-support.txt
index fb297a88f62c..7f4a20e6a12b 100644
--- a/Documentation/features/debug/optprobes/arch-support.txt
+++ b/Documentation/features/debug/optprobes/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt
index 12410f606edc..3db4763aa3f5 100644
--- a/Documentation/features/debug/stackprotector/arch-support.txt
+++ b/Documentation/features/debug/stackprotector/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/debug/uprobes/arch-support.txt b/Documentation/features/debug/uprobes/arch-support.txt
index be8acbb95b54..43cac6ee0c68 100644
--- a/Documentation/features/debug/uprobes/arch-support.txt
+++ b/Documentation/features/debug/uprobes/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/debug/user-ret-profiler/arch-support.txt b/Documentation/features/debug/user-ret-profiler/arch-support.txt
index 6bfa36b0e017..d636ed0e679f 100644
--- a/Documentation/features/debug/user-ret-profiler/arch-support.txt
+++ b/Documentation/features/debug/user-ret-profiler/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/io/dma-contiguous/arch-support.txt b/Documentation/features/io/dma-contiguous/arch-support.txt
index 895c3b0f6492..dfc93d074e3d 100644
--- a/Documentation/features/io/dma-contiguous/arch-support.txt
+++ b/Documentation/features/io/dma-contiguous/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/locking/cmpxchg-local/arch-support.txt b/Documentation/features/locking/cmpxchg-local/arch-support.txt
index 242ff5a6586e..1815c7fed06d 100644
--- a/Documentation/features/locking/cmpxchg-local/arch-support.txt
+++ b/Documentation/features/locking/cmpxchg-local/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/locking/lockdep/arch-support.txt b/Documentation/features/locking/lockdep/arch-support.txt
index 98cb9d85c55d..4f844ecd0680 100644
--- a/Documentation/features/locking/lockdep/arch-support.txt
+++ b/Documentation/features/locking/lockdep/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: |  ok  |
-    |   unicore32: |  ok  |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/locking/queued-rwlocks/arch-support.txt b/Documentation/features/locking/queued-rwlocks/arch-support.txt
index ee922746a64c..5c6bcfcf8e1f 100644
--- a/Documentation/features/locking/queued-rwlocks/arch-support.txt
+++ b/Documentation/features/locking/queued-rwlocks/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/locking/queued-spinlocks/arch-support.txt b/Documentation/features/locking/queued-spinlocks/arch-support.txt
index c52116c1a049..b55e420a34ea 100644
--- a/Documentation/features/locking/queued-spinlocks/arch-support.txt
+++ b/Documentation/features/locking/queued-spinlocks/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/perf/kprobes-event/arch-support.txt b/Documentation/features/perf/kprobes-event/arch-support.txt
index 518f352fc727..04c17c2106a4 100644
--- a/Documentation/features/perf/kprobes-event/arch-support.txt
+++ b/Documentation/features/perf/kprobes-event/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt
index c22cd6f8aa5e..e7450fbb8253 100644
--- a/Documentation/features/perf/perf-regs/arch-support.txt
+++ b/Documentation/features/perf/perf-regs/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt
index 527fe4d0b074..98e79d128d9b 100644
--- a/Documentation/features/perf/perf-stackdump/arch-support.txt
+++ b/Documentation/features/perf/perf-stackdump/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
index 8a521a622966..68658a6f8c5b 100644
--- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt
+++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt
@@ -51,7 +51,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/sched/numa-balancing/arch-support.txt b/Documentation/features/sched/numa-balancing/arch-support.txt
index 350823692f28..964457ad26c1 100644
--- a/Documentation/features/sched/numa-balancing/arch-support.txt
+++ b/Documentation/features/sched/numa-balancing/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ..  |
     |       sparc: | TODO |
     |          um: |  ..  |
-    |   unicore32: |  ..  |
     |         x86: |  ok  |
     |      xtensa: |  ..  |
     -----------------------
diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt
index c7b837f735b1..f54ddfc06a12 100644
--- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt
+++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: |  ok  |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/time/arch-tick-broadcast/arch-support.txt b/Documentation/features/time/arch-tick-broadcast/arch-support.txt
index 593536f7925b..4d11cbb3c09b 100644
--- a/Documentation/features/time/arch-tick-broadcast/arch-support.txt
+++ b/Documentation/features/time/arch-tick-broadcast/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: | TODO |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/time/clockevents/arch-support.txt b/Documentation/features/time/clockevents/arch-support.txt
index 7a27157da408..8287b6aa522e 100644
--- a/Documentation/features/time/clockevents/arch-support.txt
+++ b/Documentation/features/time/clockevents/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: |  ok  |
-    |   unicore32: |  ok  |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt
index 048bfb6d3872..a71f3a945285 100644
--- a/Documentation/features/time/context-tracking/arch-support.txt
+++ b/Documentation/features/time/context-tracking/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt
index a14bbad8e948..d9082b91f10e 100644
--- a/Documentation/features/time/irq-time-acct/arch-support.txt
+++ b/Documentation/features/time/irq-time-acct/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ..  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/time/modern-timekeeping/arch-support.txt b/Documentation/features/time/modern-timekeeping/arch-support.txt
index 1d46da165b75..a84c3b9d9a94 100644
--- a/Documentation/features/time/modern-timekeeping/arch-support.txt
+++ b/Documentation/features/time/modern-timekeeping/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: |  ok  |
-    |   unicore32: |  ok  |
     |         x86: |  ok  |
     |      xtensa: |  ok  |
     -----------------------
diff --git a/Documentation/features/time/virt-cpuacct/arch-support.txt b/Documentation/features/time/virt-cpuacct/arch-support.txt
index fb0d0cab9cab..56b372da6b01 100644
--- a/Documentation/features/time/virt-cpuacct/arch-support.txt
+++ b/Documentation/features/time/virt-cpuacct/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/ELF-ASLR/arch-support.txt b/Documentation/features/vm/ELF-ASLR/arch-support.txt
index adc25878d217..eccda0732474 100644
--- a/Documentation/features/vm/ELF-ASLR/arch-support.txt
+++ b/Documentation/features/vm/ELF-ASLR/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt
index f05588f9e4b4..c74e3f8040e1 100644
--- a/Documentation/features/vm/PG_uncached/arch-support.txt
+++ b/Documentation/features/vm/PG_uncached/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/THP/arch-support.txt b/Documentation/features/vm/THP/arch-support.txt
index cdfe8925f881..1c0b95f2b40d 100644
--- a/Documentation/features/vm/THP/arch-support.txt
+++ b/Documentation/features/vm/THP/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ..  |
     |       sparc: |  ok  |
     |          um: |  ..  |
-    |   unicore32: |  ..  |
     |         x86: |  ok  |
     |      xtensa: |  ..  |
     -----------------------
diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt
index 2bdd3b6cee3c..30f75a79ce01 100644
--- a/Documentation/features/vm/TLB/arch-support.txt
+++ b/Documentation/features/vm/TLB/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: |  ..  |
-    |   unicore32: |  ..  |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt
index 8525f1981f19..c5ff3a427722 100644
--- a/Documentation/features/vm/huge-vmap/arch-support.txt
+++ b/Documentation/features/vm/huge-vmap/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: | TODO |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/ioremap_prot/arch-support.txt b/Documentation/features/vm/ioremap_prot/arch-support.txt
index 3a6b87de6a19..1cb7406cd858 100644
--- a/Documentation/features/vm/ioremap_prot/arch-support.txt
+++ b/Documentation/features/vm/ioremap_prot/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: | TODO |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt
index 2e017387e228..13d0e1e17001 100644
--- a/Documentation/features/vm/pte_special/arch-support.txt
+++ b/Documentation/features/vm/pte_special/arch-support.txt
@@ -28,7 +28,6 @@
     |          sh: |  ok  |
     |       sparc: |  ok  |
     |          um: | TODO |
-    |   unicore32: | TODO |
     |         x86: |  ok  |
     |      xtensa: | TODO |
     -----------------------
diff --git a/MAINTAINERS b/MAINTAINERS
index 496fd4eafb68..1de95aa44bbb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17532,13 +17532,6 @@ L:	linux-fsdevel@vger.kernel.org
 S:	Supported
 F:	fs/unicode/
 
-UNICORE32 ARCHITECTURE
-M:	Guan Xuetao <gxt@pku.edu.cn>
-S:	Maintained
-W:	http://mprc.pku.edu.cn/~guanxuetao/linux
-T:	git git://github.com/gxt/linux.git
-F:	arch/unicore32/
-
 UNIFDEF
 M:	Tony Finch <dot@dotat.at>
 S:	Maintained
diff --git a/arch/unicore32/.gitignore b/arch/unicore32/.gitignore
deleted file mode 100644
index e82f3fb57ba0..000000000000
--- a/arch/unicore32/.gitignore
+++ /dev/null
@@ -1,22 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-#
-# Generated include files
-#
-include/generated
-#
-# Generated ld script file
-#
-kernel/vmlinux.lds
-#
-# Generated images in boot
-#
-boot/Image
-boot/zImage
-boot/uImage
-#
-# Generated files in boot/compressed
-#
-boot/compressed/piggy.S
-boot/compressed/piggy.gzip
-boot/compressed/vmlinux
-boot/compressed/vmlinux.lds
diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig
deleted file mode 100644
index 11ba1839d198..000000000000
--- a/arch/unicore32/Kconfig
+++ /dev/null
@@ -1,200 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-config UNICORE32
-	def_bool y
-	select ARCH_32BIT_OFF_T
-	select ARCH_HAS_DEVMEM_IS_ALLOWED
-	select ARCH_HAS_KEEPINITRD
-	select ARCH_MIGHT_HAVE_PC_PARPORT
-	select ARCH_MIGHT_HAVE_PC_SERIO
-	select HAVE_KERNEL_GZIP
-	select HAVE_KERNEL_BZIP2
-	select GENERIC_ATOMIC64
-	select HAVE_KERNEL_LZO
-	select HAVE_KERNEL_LZMA
-	select HAVE_PCI
-	select VIRT_TO_BUS
-	select ARCH_HAVE_CUSTOM_GPIO_H
-	select GENERIC_FIND_FIRST_BIT
-	select GENERIC_IRQ_PROBE
-	select GENERIC_IRQ_SHOW
-	select ARCH_WANT_FRAME_POINTERS
-	select GENERIC_IOMAP
-	select MODULES_USE_ELF_REL
-	select NEED_DMA_MAP_STATE
-	select MMU_GATHER_NO_RANGE if MMU
-	help
-	  UniCore-32 is 32-bit Instruction Set Architecture,
-	  including a series of low-power-consumption RISC chip
-	  designs licensed by PKUnity Ltd.
-	  Please see web page at <http://www.pkunity.com/>.
-
-config GENERIC_CSUM
-	def_bool y
-
-config NO_IOPORT_MAP
-	bool
-
-config STACKTRACE_SUPPORT
-	def_bool y
-
-config LOCKDEP_SUPPORT
-	def_bool y
-
-config ARCH_HAS_ILOG2_U32
-	bool
-
-config ARCH_HAS_ILOG2_U64
-	bool
-
-config GENERIC_HWEIGHT
-	def_bool y
-
-config GENERIC_CALIBRATE_DELAY
-	def_bool y
-
-config ARCH_MAY_HAVE_PC_FDC
-	bool
-
-config ZONE_DMA
-	def_bool y
-
-menu "System Type"
-
-config MMU
-	def_bool y
-
-config ARCH_FPGA
-	bool
-
-config ARCH_PUV3
-	def_bool y
-	select CPU_UCV2
-	select GENERIC_CLOCKEVENTS
-	select HAVE_LEGACY_CLK
-	select GPIOLIB
-
-# CONFIGs for ARCH_PUV3
-
-if ARCH_PUV3
-
-choice
-	prompt "Board Selection"
-	default PUV3_DB0913
-
-config PUV3_FPGA_DLX200
-	select ARCH_FPGA
-	bool "FPGA board"
-
-config PUV3_DB0913
-	bool "DEBUG board (0913)"
-
-config PUV3_NB0916
-	bool "NetBook board (0916)"
-	select PWM
-	select PWM_PUV3
-
-config PUV3_SMW0919
-	bool "Security Mini-Workstation board (0919)"
-
-endchoice
-
-config PUV3_PM
-	def_bool y if !ARCH_FPGA
-
-endif
-
-source "arch/unicore32/mm/Kconfig"
-
-comment "Floating point support"
-
-config UNICORE_FPU_F64
-	def_bool y if !ARCH_FPGA
-
-endmenu
-
-menu "Kernel Features"
-
-source "kernel/Kconfig.hz"
-
-config LEDS
-	def_bool y
-	depends on GPIOLIB
-
-config ALIGNMENT_TRAP
-	def_bool y
-	help
-	  Unicore processors can not fetch/store information which is not
-	  naturally aligned on the bus, i.e., a 4 byte fetch must start at an
-	  address divisible by 4. On 32-bit Unicore processors, these non-aligned
-	  fetch/store instructions will be emulated in software if you say
-	  here, which has a severe performance impact. This is necessary for
-	  correct operation of some network protocols. With an IP-only
-	  configuration it is safe to say N, otherwise say Y.
-
-endmenu
-
-menu "Boot options"
-
-config CMDLINE
-	string "Default kernel command string"
-	default ""
-
-config CMDLINE_FORCE
-	bool "Always use the default kernel command string"
-	depends on CMDLINE != ""
-	help
-	  Always use the default kernel command string, even if the boot
-	  loader passes other arguments to the kernel.
-	  This is useful if you cannot or don't want to change the
-	  command-line options your boot loader passes to the kernel.
-
-	  If unsure, say N.
-
-endmenu
-
-menu "Power management options"
-
-source "kernel/power/Kconfig"
-
-source "drivers/cpufreq/Kconfig"
-
-config ARCH_SUSPEND_POSSIBLE
-	def_bool y if !ARCH_FPGA
-
-config ARCH_HIBERNATION_POSSIBLE
-	def_bool y if !ARCH_FPGA
-
-endmenu
-
-if ARCH_PUV3
-
-config PUV3_GPIO
-	bool
-	depends on !ARCH_FPGA
-	select GPIO_SYSFS
-	default y
-
-if PUV3_NB0916
-
-menu "PKUnity NetBook-0916 Features"
-
-config I2C_BATTERY_BQ27200
-	tristate "I2C Battery BQ27200 Support"
-	select I2C_PUV3
-	select POWER_SUPPLY
-	select BATTERY_BQ27XXX
-
-config I2C_EEPROM_AT24
-	tristate "I2C EEPROMs AT24 support"
-	select I2C_PUV3
-	select EEPROM_AT24
-
-config LCD_BACKLIGHT
-	tristate "LCD Backlight support"
-	select BACKLIGHT_PWM
-
-endmenu
-
-endif
-
-endif
diff --git a/arch/unicore32/Kconfig.debug b/arch/unicore32/Kconfig.debug
deleted file mode 100644
index ca0ff97657ef..000000000000
--- a/arch/unicore32/Kconfig.debug
+++ /dev/null
@@ -1,29 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-
-config EARLY_PRINTK
-	def_bool DEBUG_OCD
-	help
-	  Write kernel log output directly into the ocd or to a serial port.
-
-	  This is useful for kernel debugging when your machine crashes very
-	  early before the console code is initialized. For normal operation
-	  it is not recommended because it looks ugly and doesn't cooperate
-	  with klogd/syslogd or the X server. You should normally N here,
-	  unless you want to debug such a crash.
-
-# These options are only for real kernel hackers who want to get their hands dirty.
-config DEBUG_LL
-	bool "Kernel low-level debugging functions"
-	depends on DEBUG_KERNEL
-	help
-	  Say Y here to include definitions of printascii, printch, printhex
-	  in the kernel.  This is helpful if you are debugging code that
-	  executes before the console is initialized.
-
-config DEBUG_OCD
-	bool "Kernel low-level debugging via On-Chip-Debugger"
-	depends on DEBUG_LL
-	default y
-	help
-	  Say Y here if you want the debug print routines to direct their
-	  output to the UniCore On-Chip-Debugger channel using CP #1.
diff --git a/arch/unicore32/Makefile b/arch/unicore32/Makefile
deleted file mode 100644
index 390819947c37..000000000000
--- a/arch/unicore32/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-#
-# arch/unicore32/Makefile
-#
-# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies.
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2002~2010 by Guan Xue-tao
-#
-ifneq ($(SUBARCH),$(ARCH))
-	ifeq ($(CROSS_COMPILE),)
-		CROSS_COMPILE := $(call cc-cross-prefix, unicore32-linux-)
-	endif
-endif
-
-LDFLAGS_vmlinux		:= -p --no-undefined -X
-
-OBJCOPYFLAGS		:= -O binary -R .note -R .note.gnu.build-id -R .comment -S
-
-# Never generate .eh_frame
-KBUILD_CFLAGS		+= $(call cc-option,-fno-dwarf2-cfi-asm)
-
-# Never use hard float in kernel
-KBUILD_CFLAGS		+= -msoft-float
-
-ifeq ($(CONFIG_FRAME_POINTER),y)
-KBUILD_CFLAGS		+= -mno-sched-prolog
-endif
-
-CHECKFLAGS		+= -D__unicore32__
-
-head-y			:= arch/unicore32/kernel/head.o
-
-core-y			+= arch/unicore32/kernel/
-core-y			+= arch/unicore32/mm/
-
-libs-y			+= arch/unicore32/lib/
-
-boot			:= arch/unicore32/boot
-
-# Default target when executing plain make
-KBUILD_IMAGE		:= $(boot)/zImage
-
-all:	zImage
-
-zImage Image uImage: vmlinux
-	$(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
-
-archclean:
-	$(Q)$(MAKE) $(clean)=$(boot)
-
-define archhelp
-  echo  '* zImage        - Compressed kernel image (arch/$(ARCH)/boot/zImage)'
-  echo  '  Image         - Uncompressed kernel image (arch/$(ARCH)/boot/Image)'
-  echo  '  uImage        - U-Boot wrapped zImage'
-endef
diff --git a/arch/unicore32/boot/Makefile b/arch/unicore32/boot/Makefile
deleted file mode 100644
index 828855007b29..000000000000
--- a/arch/unicore32/boot/Makefile
+++ /dev/null
@@ -1,39 +0,0 @@
-#
-# arch/unicore32/boot/Makefile
-#
-# This file is included by the global makefile so that you can add your own
-# architecture-specific flags and dependencies.
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2001~2010 GUAN Xue-tao
-#
-
-targets := Image zImage uImage
-
-$(obj)/Image: vmlinux FORCE
-	$(call if_changed,objcopy)
-	@echo '  Kernel: $@ is ready'
-
-$(obj)/compressed/vmlinux: $(obj)/Image FORCE
-	$(Q)$(MAKE) $(build)=$(obj)/compressed $@
-
-$(obj)/zImage: $(obj)/compressed/vmlinux FORCE
-	$(call if_changed,objcopy)
-	@echo '  Kernel: $@ is ready'
-
-UIMAGE_ARCH = unicore
-UIMAGE_LOADADDR = 0x0
-
-$(obj)/uImage: $(obj)/zImage FORCE
-	$(call if_changed,uimage)
-	@echo '  Image $@ is ready'
-
-PHONY += initrd
-initrd:
-	@test "$(INITRD)" != "" || \
-	(echo You must specify INITRD; exit -1)
-
-subdir- := compressed
diff --git a/arch/unicore32/boot/compressed/Makefile b/arch/unicore32/boot/compressed/Makefile
deleted file mode 100644
index 150fafc32fb0..000000000000
--- a/arch/unicore32/boot/compressed/Makefile
+++ /dev/null
@@ -1,64 +0,0 @@
-#
-# linux/arch/unicore32/boot/compressed/Makefile
-#
-# create a compressed vmlinuz image from the original vmlinux
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-# Copyright (C) 2001~2010 GUAN Xue-tao
-#
-
-ccflags-y	:= -fpic -fno-builtin
-asflags-y	:= -Wa,-march=all
-
-OBJS		:= misc.o
-
-# font.c and font.o
-CFLAGS_font.o	:= -Dstatic=
-$(obj)/font.c: $(srctree)/lib/fonts/font_8x8.c
-	$(call cmd,shipped)
-
-# piggy.S and piggy.o
-suffix_$(CONFIG_KERNEL_GZIP)	:= gzip
-suffix_$(CONFIG_KERNEL_BZIP2)	:= bz2
-suffix_$(CONFIG_KERNEL_LZO)	:= lzo
-suffix_$(CONFIG_KERNEL_LZMA)	:= lzma
-
-$(obj)/piggy.$(suffix_y): $(obj)/../Image FORCE
-	$(call if_changed,$(suffix_y))
-
-SEDFLAGS_piggy	= s/DECOMP_SUFFIX/$(suffix_y)/
-$(obj)/piggy.S: $(obj)/piggy.S.in
-	@sed "$(SEDFLAGS_piggy)" < $< > $@
-
-$(obj)/piggy.o:  $(obj)/piggy.$(suffix_y) $(obj)/piggy.S FORCE
-
-targets		:= vmlinux vmlinux.lds font.o font.c head.o misc.o \
-			piggy.$(suffix_y) piggy.o piggy.S \
-
-# Make sure files are removed during clean
-extra-y		+= piggy.gzip piggy.bz2 piggy.lzo piggy.lzma
-
-# ?
-LDFLAGS_vmlinux += -p
-# Report unresolved symbol references
-LDFLAGS_vmlinux += --no-undefined
-# Delete all temporary local symbols
-LDFLAGS_vmlinux += -X
-# Next argument is a linker script
-LDFLAGS_vmlinux += -T
-
-# For uidivmod
-$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head.o $(obj)/piggy.o \
-		$(obj)/misc.o FORCE
-	$(call if_changed,ld)
-
-# We now have a PIC decompressor implementation.  Decompressors running
-# from RAM should not define ZTEXTADDR.  Decompressors running directly
-# from ROM or Flash must define ZTEXTADDR (preferably via the config)
-ZTEXTADDR	:= 0x03000000
-ZBSSADDR	:= ALIGN(4)
-
-CPPFLAGS_vmlinux.lds = -DTEXT_START="$(ZTEXTADDR)" -DBSS_START="$(ZBSSADDR)"
diff --git a/arch/unicore32/boot/compressed/head.S b/arch/unicore32/boot/compressed/head.S
deleted file mode 100644
index 5f72662cd294..000000000000
--- a/arch/unicore32/boot/compressed/head.S
+++ /dev/null
@@ -1,201 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/boot/compressed/head.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <mach/memory.h>
-
-#define csub	cmpsub
-#define cand	cmpand
-#define nop8	nop; nop; nop; nop; nop; nop; nop; nop
-
-		.section ".start", #alloc, #execinstr
-		.text
-start:
-		.type	start,#function
-
-		/* Initialize ASR, PRIV mode and INTR off */
-		mov	r0, #0xD3
-		mov.a	asr, r0
-
-		adr	r0, LC0
-		ldm	(r1, r2, r3, r5, r6, r7, r8), [r0]+
-		ldw	sp, [r0+], #28
-		sub.a	r0, r0, r1		@ calculate the delta offset
-
-		/*
-		 * if delta is zero, we are running at the address
-		 * we were linked at.
-		 */
-		beq	not_relocated
-
-		/*
-		 * We're running at a different address.  We need to fix
-		 * up various pointers:
-		 *   r5 - zImage base address (_start)
-		 *   r7 - GOT start
-		 *   r8 - GOT end
-		 */
-		add	r5, r5, r0
-		add	r7, r7, r0
-		add	r8, r8, r0
-
-		/*
-		 * we need to fix up pointers into the BSS region.
-		 *   r2 - BSS start
-		 *   r3 - BSS end
-		 *   sp - stack pointer
-		 */
-		add	r2, r2, r0
-		add	r3, r3, r0
-		add	sp, sp, r0
-
-		/*
-		 * Relocate all entries in the GOT table.
-		 * This fixes up the C references.
-		 *   r7 - GOT start
-		 *   r8 - GOT end
-		 */
-1001:		ldw	r1, [r7+], #0
-		add	r1, r1, r0
-		stw.w	r1, [r7]+, #4
-		csub.a	r7, r8
-		bub	1001b
-
-not_relocated:
-		/*
-		 * Clear BSS region.
-		 *   r2 - BSS start
-		 *   r3 - BSS end
-		 */
-		mov	r0, #0
-1002:		stw.w	r0, [r2]+, #4
-		csub.a	r2, r3
-		bub	1002b
-
-		/*
-		 * Turn on the cache.
-		 */
-                mov     r0, #0
-                movc    p0.c5, r0, #28		@ cache invalidate all
-                nop8
-                movc    p0.c6, r0, #6		@ tlb invalidate all
-                nop8
-
-                mov     r0, #0x1c		@ en icache and wb dcache
-                movc    p0.c1, r0, #0
-                nop8
-
-		/*
-		 * Set up some pointers, for starting decompressing.
-		 */
-
-		mov	r1, sp			@ malloc space above stack
-		add	r2, sp, #0x10000	@ 64k max
-
-		/*
-		 * Check to see if we will overwrite ourselves.
-		 *   r4 = final kernel address
-		 *   r5 = start of this image
-		 *   r6 = size of decompressed image
-		 *   r2 = end of malloc space (and therefore this image)
-		 * We basically want:
-		 *   r4 >= r2 -> OK
-		 *   r4 + image length <= r5 -> OK
-		 */
-		ldw	r4, =KERNEL_IMAGE_START
-		csub.a	r4, r2
-		bea	wont_overwrite
-		add	r0, r4, r6
-		csub.a	r0, r5
-		beb	wont_overwrite
-
-		/*
-		 * If overwrite, just print error message
-		 */
-		b	__error_overwrite
-
-		/*
-		 * We're not in danger of overwriting ourselves.
-		 * Do this the simple way.
-		 */
-wont_overwrite:
-		/*
-		 * decompress_kernel:
-		 *   r0: output_start
-		 *   r1: free_mem_ptr_p
-		 *   r2: free_mem_ptr_end_p
-		 */
-		mov	r0, r4
-		b.l	decompress_kernel	@ C functions
-
-		/*
-		 * Clean and flush the cache to maintain consistency.
-		 */
-		mov	r0, #0
-                movc    p0.c5, r0, #14		@ flush dcache
-		nop8
-                movc    p0.c5, r0, #20		@ icache invalidate all
-                nop8
-
-		/*
-		 * Turn off the Cache and MMU.
-		 */
-		mov	r0, #0			@ disable i/d cache and MMU
-		movc	p0.c1, r0, #0
-                nop8
-
-		mov	r0, #0			@ must be zero
-		ldw	r4, =KERNEL_IMAGE_START
-		mov	pc, r4			@ call kernel
-
-
-		.align	2
-		.type	LC0, #object
-LC0:		.word	LC0			@ r1
-		.word	__bss_start		@ r2
-		.word	_end			@ r3
-		.word	_start			@ r5
-		.word	_image_size		@ r6
-		.word	_got_start		@ r7
-		.word	_got_end		@ r8
-		.word	decompress_stack_end	@ sp
-		.size	LC0, . - LC0
-
-print_string:
-#ifdef CONFIG_DEBUG_OCD
-2001:		ldb.w	r1, [r0]+, #1
-		csub.a	r1, #0
-		bne	2002f
-		mov	pc, lr
-2002:
-		movc	r2, p1.c0, #0
-		cand.a	r2, #2
-		bne	2002b
-		movc	p1.c1, r1, #1
-		csub.a	r1, #'\n'
-		cmoveq	r1, #'\r'
-		beq	2002b
-		b	2001b
-#else
-		mov	pc, lr
-#endif
-
-__error_overwrite:
-		adr	r0, str_error
-		b.l	print_string
-2001:		nop8
-		b	2001b
-str_error:	.asciz	"\nError: Kernel address OVERWRITE\n"
-		.align
-
-		.ltorg
-
-		.align	4
-		.section ".stack", "aw", %nobits
-decompress_stack:	.space	4096
-decompress_stack_end:
diff --git a/arch/unicore32/boot/compressed/misc.c b/arch/unicore32/boot/compressed/misc.c
deleted file mode 100644
index 450d3355de20..000000000000
--- a/arch/unicore32/boot/compressed/misc.c
+++ /dev/null
@@ -1,123 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/boot/compressed/misc.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#include <asm/unaligned.h>
-#include <mach/uncompress.h>
-
-/*
- * gzip delarations
- */
-unsigned char *output_data;
-unsigned long output_ptr;
-
-unsigned int free_mem_ptr;
-unsigned int free_mem_end_ptr;
-
-#define STATIC static
-#define STATIC_RW_DATA	/* non-static please */
-
-/*
- * arch-dependent implementations
- */
-#ifndef ARCH_HAVE_DECOMP_ERROR
-#define arch_decomp_error(x)
-#endif
-
-#ifndef ARCH_HAVE_DECOMP_SETUP
-#define arch_decomp_setup()
-#endif
-
-#ifndef ARCH_HAVE_DECOMP_PUTS
-#define arch_decomp_puts(p)
-#endif
-
-void *memcpy(void *dest, const void *src, size_t n)
-{
-	int i = 0;
-	unsigned char *d = (unsigned char *)dest, *s = (unsigned char *)src;
-
-	for (i = n >> 3; i > 0; i--) {
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-	}
-
-	if (n & 1 << 2) {
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-		*d++ = *s++;
-	}
-
-	if (n & 1 << 1) {
-		*d++ = *s++;
-		*d++ = *s++;
-	}
-
-	if (n & 1)
-		*d++ = *s++;
-
-	return dest;
-}
-
-void error(char *x)
-{
-	arch_decomp_puts("\n\n");
-	arch_decomp_puts(x);
-	arch_decomp_puts("\n\n -- System halted");
-
-	arch_decomp_error(x);
-
-	for (;;)
-		; /* Halt */
-}
-
-/* Heap size should be adjusted for different decompress method */
-#ifdef CONFIG_KERNEL_GZIP
-#include "../../../../lib/decompress_inflate.c"
-#endif
-
-#ifdef CONFIG_KERNEL_BZIP2
-#include "../../../../lib/decompress_bunzip2.c"
-#endif
-
-#ifdef CONFIG_KERNEL_LZO
-#include "../../../../lib/decompress_unlzo.c"
-#endif
-
-#ifdef CONFIG_KERNEL_LZMA
-#include "../../../../lib/decompress_unlzma.c"
-#endif
-
-unsigned long decompress_kernel(unsigned long output_start,
-		unsigned long free_mem_ptr_p,
-		unsigned long free_mem_ptr_end_p)
-{
-	unsigned char *tmp;
-
-	output_data		= (unsigned char *)output_start;
-	free_mem_ptr		= free_mem_ptr_p;
-	free_mem_end_ptr	= free_mem_ptr_end_p;
-
-	arch_decomp_setup();
-
-	tmp = (unsigned char *) (((unsigned long)input_data_end) - 4);
-	output_ptr = get_unaligned_le32(tmp);
-
-	arch_decomp_puts("Uncompressing Linux...");
-	__decompress(input_data, input_data_end - input_data, NULL, NULL,
-			output_data, 0, NULL, error);
-	arch_decomp_puts(" done, booting the kernel.\n");
-	return output_ptr;
-}
diff --git a/arch/unicore32/boot/compressed/piggy.S.in b/arch/unicore32/boot/compressed/piggy.S.in
deleted file mode 100644
index b79704d58026..000000000000
--- a/arch/unicore32/boot/compressed/piggy.S.in
+++ /dev/null
@@ -1,6 +0,0 @@
-	.section .piggydata,#alloc
-	.globl	input_data
-input_data:
-	.incbin	"arch/unicore32/boot/compressed/piggy.DECOMP_SUFFIX"
-	.globl	input_data_end
-input_data_end:
diff --git a/arch/unicore32/boot/compressed/vmlinux.lds.S b/arch/unicore32/boot/compressed/vmlinux.lds.S
deleted file mode 100644
index edda4ddfa357..000000000000
--- a/arch/unicore32/boot/compressed/vmlinux.lds.S
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore/boot/compressed/vmlinux.lds.in
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-OUTPUT_ARCH(unicore32)
-ENTRY(_start)
-SECTIONS
-{
-  /DISCARD/ : {
-    /*
-     * Discard any r/w data - this produces a link error if we have any,
-     * which is required for PIC decompression.  Local data generates
-     * GOTOFF relocations, which prevents it being relocated independently
-     * of the text/got segments.
-     */
-    *(.data)
-  }
-
-  . = TEXT_START;
-  _text = .;
-
-  .text : {
-    _start = .;
-    *(.start)
-    *(.text)
-    *(.text.*)
-    *(.fixup)
-    *(.gnu.warning)
-    *(.rodata)
-    *(.rodata.*)
-    *(.piggydata)
-    . = ALIGN(4);
-  }
-
-  _etext = .;
-
-  /* Assume size of decompressed image is 4x the compressed image */
-  _image_size = (_etext - _text) * 4;
-
-  _got_start = .;
-  .got			: { *(.got) }
-  _got_end = .;
-  .got.plt		: { *(.got.plt) }
-  _edata = .;
-
-  . = BSS_START;
-  __bss_start = .;
-  .bss			: { *(.bss) }
-  _end = .;
-
-  .stack		: { *(.stack) }
-  .comment 0		: { *(.comment) }
-}
-
diff --git a/arch/unicore32/configs/defconfig b/arch/unicore32/configs/defconfig
deleted file mode 100644
index 360cc9abcdb0..000000000000
--- a/arch/unicore32/configs/defconfig
+++ /dev/null
@@ -1,214 +0,0 @@
-### General setup
-CONFIG_EXPERIMENTAL=y
-CONFIG_LOCALVERSION="-unicore32"
-CONFIG_SWAP=y
-CONFIG_SYSVIPC=y
-CONFIG_POSIX_MQUEUE=y
-CONFIG_HOTPLUG=y
-#	Initial RAM filesystem and RAM disk (initramfs/initrd) support
-#CONFIG_BLK_DEV_INITRD=y
-#CONFIG_INITRAMFS_SOURCE="arch/unicore/ramfs/ramfs_config"
-
-### Enable loadable module support
-CONFIG_MODULES=n
-CONFIG_MODULE_UNLOAD=y
-
-### System Type
-CONFIG_ARCH_PUV3=y
-#	Board Selection
-CONFIG_PUV3_NB0916=y
-#	Processor Features
-CONFIG_CPU_DCACHE_LINE_DISABLE=y
-CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE=n
-
-### Bus support
-CONFIG_PCI=y
-CONFIG_PCI_LEGACY=n
-
-### Boot options
-#	for debug, adding: earlyprintk=ocd,keep initcall_debug
-#	others support: test_suspend=mem root=/dev/sda
-#	hibernate support: resume=/dev/sda3
-CONFIG_CMDLINE="earlyprintk=ocd,keep ignore_loglevel"
-# TODO: mem=512M video=unifb:1024x600-16@75
-# for nfs: root=/dev/nfs rw nfsroot=192.168.10.88:/home/udb/nfs/,rsize=1024,wsize=1024
-#	ip=192.168.10.83:192.168.10.88:192.168.10.1:255.255.255.0::eth0:off
-CONFIG_CMDLINE_FORCE=y
-
-### Power management options
-CONFIG_PM=y
-CONFIG_HIBERNATION=y
-CONFIG_PM_STD_PARTITION="/dev/sda3"
-CONFIG_CPU_FREQ=n
-CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y
-
-### Networking support
-CONFIG_NET=y
-#	Networking options
-CONFIG_PACKET=m
-CONFIG_UNIX=m
-#	TCP/IP networking
-CONFIG_INET=y
-CONFIG_IP_MULTICAST=y
-CONFIG_IP_PNP=y
-CONFIG_IPV6=n
-#	Wireless
-CONFIG_WIRELESS=y
-CONFIG_WIRELESS_EXT=y
-CONFIG_MAC80211=m
-
-### PKUnity SoC Features
-CONFIG_USB_WLAN_HED_AQ3=n
-CONFIG_USB_CMMB_INNOFIDEI=n
-CONFIG_I2C_BATTERY_BQ27200=n
-CONFIG_I2C_EEPROM_AT24=n
-CONFIG_LCD_BACKLIGHT=n
-
-CONFIG_PUV3_UMAL=y
-CONFIG_PUV3_MUSB=n
-CONFIG_PUV3_AC97=n
-CONFIG_PUV3_NAND=n
-CONFIG_PUV3_MMC=n
-CONFIG_PUV3_UART=n
-
-### Device Drivers
-#	Memory Technology Device (MTD) support
-CONFIG_MTD=m
-CONFIG_MTD_UBI=m
-CONFIG_MTD_PARTITIONS=y
-CONFIG_MTD_CHAR=m
-CONFIG_MTD_BLKDEVS=m
-#	RAM/ROM/Flash chip drivers
-CONFIG_MTD_CFI=m
-CONFIG_MTD_JEDECPROBE=m
-CONFIG_MTD_CFI_AMDSTD=m
-#	Mapping drivers for chip access
-CONFIG_MTD_PHYSMAP=m
-
-#	Block devices
-CONFIG_BLK_DEV_LOOP=m
-
-#	SCSI device support
-CONFIG_SCSI=y
-CONFIG_BLK_DEV_SD=y
-CONFIG_BLK_DEV_SR=m
-CONFIG_CHR_DEV_SG=m
-
-#	Serial ATA (prod) and Parallel ATA (experimental) drivers
-CONFIG_ATA=y
-CONFIG_SATA_VIA=y
-
-#	Network device support
-CONFIG_NETDEVICES=y
-CONFIG_NET_ETHERNET=y
-CONFIG_NETDEV_1000=y
-#	Wireless LAN
-CONFIG_WLAN_80211=n
-CONFIG_RT2X00=n
-CONFIG_RT73USB=n
-
-#	Input device support
-CONFIG_INPUT_EVDEV=m
-#	Keyboards
-CONFIG_KEYBOARD_GPIO=m
-
-#	I2C support
-CONFIG_I2C=y
-CONFIG_I2C_PUV3=y
-
-#	Hardware Monitoring support
-#CONFIG_SENSORS_LM75=m
-#	Generic Thermal sysfs driver
-#CONFIG_THERMAL=y
-#CONFIG_THERMAL_HWMON=y
-
-#	Multimedia support
-CONFIG_MEDIA_SUPPORT=n
-CONFIG_VIDEO_DEV=n
-CONFIG_USB_VIDEO_CLASS=n
-
-#	Graphics support
-CONFIG_FB=y
-CONFIG_FB_PUV3_UNIGFX=y
-#	Console display driver support
-CONFIG_VGA_CONSOLE=n
-CONFIG_FRAMEBUFFER_CONSOLE=y
-CONFIG_FONTS=y
-CONFIG_FONT_8x8=y
-CONFIG_FONT_8x16=y
-#	Bootup logo
-CONFIG_LOGO=n
-
-#	Sound card support
-CONFIG_SOUND=m
-#	Advanced Linux Sound Architecture
-CONFIG_SND=m
-CONFIG_SND_MIXER_OSS=m
-CONFIG_SND_PCM_OSS=m
-
-#	USB support
-CONFIG_USB_ARCH_HAS_HCD=n
-CONFIG_USB=n
-CONFIG_USB_PRINTER=n
-CONFIG_USB_STORAGE=n
-#	Inventra Highspeed Dual Role Controller
-CONFIG_USB_MUSB_HDRC=n
-
-#	LED Support
-CONFIG_NEW_LEDS=y
-CONFIG_LEDS_CLASS=y
-CONFIG_LEDS_GPIO=y
-#	LED Triggers
-CONFIG_LEDS_TRIGGERS=y
-CONFIG_LEDS_TRIGGER_TIMER=y
-CONFIG_LEDS_TRIGGER_DISK=y
-CONFIG_LEDS_TRIGGER_HEARTBEAT=y
-
-#	Real Time Clock
-CONFIG_RTC_LIB=y
-CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_PUV3=y
-
-### File systems
-CONFIG_EXT2_FS=m
-CONFIG_EXT3_FS=y
-CONFIG_EXT4_FS=y
-CONFIG_FUSE_FS=m
-#	CD-ROM/DVD Filesystems
-CONFIG_ISO9660_FS=m
-CONFIG_JOLIET=y
-CONFIG_UDF_FS=m
-#	DOS/FAT/NT Filesystems
-CONFIG_VFAT_FS=m
-#	Pseudo filesystems
-CONFIG_PROC_FS=y
-CONFIG_SYSFS=y
-CONFIG_TMPFS=y
-#	Miscellaneous filesystems
-CONFIG_MISC_FILESYSTEMS=y
-CONFIG_JFFS2_FS=m
-CONFIG_UBIFS_FS=m
-#	Network File Systems
-CONFIG_NETWORK_FILESYSTEMS=y
-CONFIG_NFS_FS=y
-CONFIG_NFS_V3=y
-CONFIG_ROOT_NFS=y
-#	Partition Types
-CONFIG_PARTITION_ADVANCED=y
-CONFIG_MSDOS_PARTITION=y
-#	Native language support
-CONFIG_NLS=y
-CONFIG_NLS_CODEPAGE_437=m
-CONFIG_NLS_CODEPAGE_936=m
-CONFIG_NLS_ISO8859_1=m
-CONFIG_NLS_UTF8=m
-
-### Kernel hacking
-CONFIG_FRAME_WARN=8096
-CONFIG_MAGIC_SYSRQ=y
-CONFIG_DEBUG_KERNEL=y
-CONFIG_PROVE_LOCKING=n
-CONFIG_DEBUG_BUGVERBOSE=y
-CONFIG_FRAME_POINTER=y
-CONFIG_DEBUG_LL=y
-
diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild
deleted file mode 100644
index 55026e8240d8..000000000000
--- a/arch/unicore32/include/asm/Kbuild
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generic-y += extable.h
-generic-y += kvm_para.h
-generic-y += mcs_spinlock.h
-generic-y += parport.h
-generic-y += syscalls.h
-generic-y += user.h
diff --git a/arch/unicore32/include/asm/assembler.h b/arch/unicore32/include/asm/assembler.h
deleted file mode 100644
index 3de843d92850..000000000000
--- a/arch/unicore32/include/asm/assembler.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/assembler.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  Do not include any C declarations in this file - it is included by
- *  assembler source.
- */
-#ifndef __ASSEMBLY__
-#error "Only include this from assembly code"
-#endif
-
-#include <asm/ptrace.h>
-
-/*
- * Little Endian independent macros for shifting bytes within registers.
- */
-#define pull            >>
-#define push            <<
-#define get_byte_0      << #0
-#define get_byte_1	>> #8
-#define get_byte_2	>> #16
-#define get_byte_3	>> #24
-#define put_byte_0      << #0
-#define put_byte_1	<< #8
-#define put_byte_2	<< #16
-#define put_byte_3	<< #24
-
-#define cadd		cmpadd
-#define cand		cmpand
-#define csub		cmpsub
-#define cxor		cmpxor
-
-/*
- * Enable and disable interrupts
- */
-	.macro disable_irq, temp
-	mov	\temp, asr
-	andn     \temp, \temp, #0xFF
-	or	\temp, \temp, #PSR_I_BIT | PRIV_MODE
-	mov.a	asr, \temp
-	.endm
-
-	.macro enable_irq, temp
-	mov	\temp, asr
-	andn     \temp, \temp, #0xFF
-	or	\temp, \temp, #PRIV_MODE
-	mov.a	asr, \temp
-	.endm
-
-#define USER(x...)				\
-9999:	x;					\
-	.pushsection __ex_table, "a";		\
-	.align	3;				\
-	.long	9999b, 9001f;			\
-	.popsection
-
-	.macro	notcond, cond, nexti = .+8
-	.ifc	\cond, eq
-		bne	\nexti
-	.else;	.ifc	\cond, ne
-		beq	\nexti
-	.else;	.ifc	\cond, ea
-		bub	\nexti
-	.else;	.ifc	\cond, ub
-		bea	\nexti
-	.else;	.ifc	\cond, fs
-		bns	\nexti
-	.else;	.ifc	\cond, ns
-		bfs	\nexti
-	.else;	.ifc	\cond, fv
-		bnv	\nexti
-	.else;	.ifc	\cond, nv
-		bfv	\nexti
-	.else;	.ifc	\cond, ua
-		beb	\nexti
-	.else;	.ifc	\cond, eb
-		bua	\nexti
-	.else;	.ifc	\cond, eg
-		bsl	\nexti
-	.else;	.ifc	\cond, sl
-		beg	\nexti
-	.else;	.ifc	\cond, sg
-		bel	\nexti
-	.else;	.ifc	\cond, el
-		bsg	\nexti
-	.else;	.ifnc	\cond, al
-		.error  "Unknown cond in notcond macro argument"
-	.endif;	.endif;	.endif;	.endif;	.endif;	.endif;	.endif
-	.endif;	.endif;	.endif;	.endif;	.endif;	.endif;	.endif
-	.endif
-	.endm
-
-	.macro	usracc, instr, reg, ptr, inc, cond, rept, abort
-	.rept	\rept
-	notcond	\cond, .+8
-9999 :
-	.if	\inc == 1
-	\instr\()b.u \reg, [\ptr], #\inc
-	.elseif	\inc == 4
-	\instr\()w.u \reg, [\ptr], #\inc
-	.else
-	.error	"Unsupported inc macro argument"
-	.endif
-
-	.pushsection __ex_table, "a"
-	.align	3
-	.long	9999b, \abort
-	.popsection
-	.endr
-	.endm
-
-	.macro	strusr, reg, ptr, inc, cond = al, rept = 1, abort = 9001f
-	usracc	st, \reg, \ptr, \inc, \cond, \rept, \abort
-	.endm
-
-	.macro	ldrusr, reg, ptr, inc, cond = al, rept = 1, abort = 9001f
-	usracc	ld, \reg, \ptr, \inc, \cond, \rept, \abort
-	.endm
-
-	.macro	nop8
-	.rept	8
-		nop
-	.endr
-	.endm
diff --git a/arch/unicore32/include/asm/barrier.h b/arch/unicore32/include/asm/barrier.h
deleted file mode 100644
index efb81de87507..000000000000
--- a/arch/unicore32/include/asm/barrier.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Memory barrier implementations for PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2012 GUAN Xue-tao
- */
-#ifndef __UNICORE_BARRIER_H__
-#define __UNICORE_BARRIER_H__
-
-#define isb() __asm__ __volatile__ ("" : : : "memory")
-#define dsb() __asm__ __volatile__ ("" : : : "memory")
-#define dmb() __asm__ __volatile__ ("" : : : "memory")
-
-#include <asm-generic/barrier.h>
-
-#endif /* __UNICORE_BARRIER_H__ */
diff --git a/arch/unicore32/include/asm/bitops.h b/arch/unicore32/include/asm/bitops.h
deleted file mode 100644
index deeb2163f35e..000000000000
--- a/arch/unicore32/include/asm/bitops.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/bitops.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_BITOPS_H__
-#define __UNICORE_BITOPS_H__
-
-#define _ASM_GENERIC_BITOPS_FLS_H_
-#define _ASM_GENERIC_BITOPS___FLS_H_
-#define _ASM_GENERIC_BITOPS_FFS_H_
-#define _ASM_GENERIC_BITOPS___FFS_H_
-/*
- * On UNICORE, those functions can be implemented around
- * the cntlz instruction for much better code efficiency.
- */
-
-static inline int fls(unsigned int x)
-{
-	int ret;
-
-	asm("cntlz\t%0, %1" : "=r" (ret) : "r" (x) : "cc");
-	ret = 32 - ret;
-
-	return ret;
-}
-
-#define __fls(x) (fls(x) - 1)
-#define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); })
-#define __ffs(x) (ffs(x) - 1)
-
-#include <asm-generic/bitops.h>
-
-/* following definitions: to avoid using codes in lib/find_*.c */
-#define find_next_bit		find_next_bit
-#define find_next_zero_bit	find_next_zero_bit
-#define find_first_bit		find_first_bit
-#define find_first_zero_bit	find_first_zero_bit
-
-#include <asm-generic/bitops/find.h>
-
-#endif /* __UNICORE_BITOPS_H__ */
diff --git a/arch/unicore32/include/asm/bug.h b/arch/unicore32/include/asm/bug.h
deleted file mode 100644
index 99acea84a865..000000000000
--- a/arch/unicore32/include/asm/bug.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Bug handling for PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2012 GUAN Xue-tao
- */
-#ifndef __UNICORE_BUG_H__
-#define __UNICORE_BUG_H__
-
-#include <asm-generic/bug.h>
-
-struct pt_regs;
-struct siginfo;
-
-extern void die(const char *msg, struct pt_regs *regs, int err);
-extern void uc32_notify_die(const char *str, struct pt_regs *regs,
-		int sig, int code, void __user *addr,
-		unsigned long err, unsigned long trap);
-
-#endif /* __UNICORE_BUG_H__ */
diff --git a/arch/unicore32/include/asm/cache.h b/arch/unicore32/include/asm/cache.h
deleted file mode 100644
index 44ecd1f300fe..000000000000
--- a/arch/unicore32/include/asm/cache.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/cache.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_CACHE_H__
-#define __UNICORE_CACHE_H__
-
-#define L1_CACHE_SHIFT		(5)
-#define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
-
-/*
- * Memory returned by kmalloc() may be used for DMA, so we must make
- * sure that all such allocations are cache aligned. Otherwise,
- * unrelated code may cause parts of the buffer to be read into the
- * cache before the transfer is done, causing old data to be seen by
- * the CPU.
- */
-#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
-
-#endif
diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h
deleted file mode 100644
index ff0be92ebc32..000000000000
--- a/arch/unicore32/include/asm/cacheflush.h
+++ /dev/null
@@ -1,186 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/cacheflush.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_CACHEFLUSH_H__
-#define __UNICORE_CACHEFLUSH_H__
-
-#include <linux/mm.h>
-
-#include <asm/shmparam.h>
-
-#define CACHE_COLOUR(vaddr)	((vaddr & (SHMLBA - 1)) >> PAGE_SHIFT)
-
-/*
- * This flag is used to indicate that the page pointed to by a pte is clean
- * and does not require cleaning before returning it to the user.
- */
-#define PG_dcache_clean PG_arch_1
-
-/*
- *	MM Cache Management
- *	===================
- *
- *	The arch/unicore32/mm/cache.S files implement these methods.
- *
- *	Start addresses are inclusive and end addresses are exclusive;
- *	start addresses should be rounded down, end addresses up.
- *
- *	See Documentation/core-api/cachetlb.rst for more information.
- *	Please note that the implementation of these, and the required
- *	effects are cache-type (VIVT/VIPT/PIPT) specific.
- *
- *	flush_icache_all()
- *
- *		Unconditionally clean and invalidate the entire icache.
- *		Currently only needed for cache-v6.S and cache-v7.S, see
- *		__flush_icache_all for the generic implementation.
- *
- *	flush_kern_all()
- *
- *		Unconditionally clean and invalidate the entire cache.
- *
- *	flush_user_all()
- *
- *		Clean and invalidate all user space cache entries
- *		before a change of page tables.
- *
- *	flush_user_range(start, end, flags)
- *
- *		Clean and invalidate a range of cache entries in the
- *		specified address space before a change of page tables.
- *		- start - user start address (inclusive, page aligned)
- *		- end   - user end address   (exclusive, page aligned)
- *		- flags - vma->vm_flags field
- *
- *	coherent_kern_range(start, end)
- *
- *		Ensure coherency between the Icache and the Dcache in the
- *		region described by start, end.  If you have non-snooping
- *		Harvard caches, you need to implement this function.
- *		- start  - virtual start address
- *		- end    - virtual end address
- *
- *	coherent_user_range(start, end)
- *
- *		Ensure coherency between the Icache and the Dcache in the
- *		region described by start, end.  If you have non-snooping
- *		Harvard caches, you need to implement this function.
- *		- start  - virtual start address
- *		- end    - virtual end address
- *
- *	flush_kern_dcache_area(kaddr, size)
- *
- *		Ensure that the data held in page is written back.
- *		- kaddr  - page address
- *		- size   - region size
- *
- *	DMA Cache Coherency
- *	===================
- *
- *	dma_flush_range(start, end)
- *
- *		Clean and invalidate the specified virtual address range.
- *		- start  - virtual start address
- *		- end    - virtual end address
- */
-
-extern void __cpuc_flush_icache_all(void);
-extern void __cpuc_flush_kern_all(void);
-extern void __cpuc_flush_user_all(void);
-extern void __cpuc_flush_user_range(unsigned long, unsigned long, unsigned int);
-extern void __cpuc_coherent_kern_range(unsigned long, unsigned long);
-extern void __cpuc_coherent_user_range(unsigned long, unsigned long);
-extern void __cpuc_flush_dcache_area(void *, size_t);
-extern void __cpuc_flush_kern_dcache_area(void *addr, size_t size);
-
-/*
- * Copy user data from/to a page which is mapped into a different
- * processes address space.  Really, we want to allow our "user
- * space" model to handle this.
- */
-extern void copy_to_user_page(struct vm_area_struct *, struct page *,
-	unsigned long, void *, const void *, unsigned long);
-#define copy_from_user_page(vma, page, vaddr, dst, src, len)	\
-	do {							\
-		memcpy(dst, src, len);				\
-	} while (0)
-
-/*
- * Convert calls to our calling convention.
- */
-/* Invalidate I-cache */
-static inline void __flush_icache_all(void)
-{
-	asm("movc	p0.c5, %0, #20;\n"
-	    "nop; nop; nop; nop; nop; nop; nop; nop\n"
-	    :
-	    : "r" (0));
-}
-
-#define flush_cache_all()		__cpuc_flush_kern_all()
-
-extern void flush_cache_mm(struct mm_struct *mm);
-extern void flush_cache_range(struct vm_area_struct *vma,
-		unsigned long start, unsigned long end);
-extern void flush_cache_page(struct vm_area_struct *vma,
-		unsigned long user_addr, unsigned long pfn);
-
-#define flush_cache_dup_mm(mm) flush_cache_mm(mm)
-
-/*
- * Perform necessary cache operations to ensure that data previously
- * stored within this range of addresses can be executed by the CPU.
- */
-#define flush_icache_range(s, e)	__cpuc_coherent_kern_range(s, e)
-
-/*
- * Perform necessary cache operations to ensure that the TLB will
- * see data written in the specified area.
- */
-#define clean_dcache_area(start, size)	cpu_dcache_clean_area(start, size)
-
-/*
- * flush_dcache_page is used when the kernel has written to the page
- * cache page at virtual address page->virtual.
- *
- * If this page isn't mapped (ie, page_mapping == NULL), or it might
- * have userspace mappings, then we _must_ always clean + invalidate
- * the dcache entries associated with the kernel mapping.
- *
- * Otherwise we can defer the operation, and clean the cache when we are
- * about to change to user space.  This is the same method as used on SPARC64.
- * See update_mmu_cache for the user space part.
- */
-#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1
-extern void flush_dcache_page(struct page *);
-
-#define flush_dcache_mmap_lock(mapping)		do { } while (0)
-#define flush_dcache_mmap_unlock(mapping)	do { } while (0)
-
-/*
- * We don't appear to need to do anything here.  In fact, if we did, we'd
- * duplicate cache flushing elsewhere performed by flush_dcache_page().
- */
-#define flush_icache_page(vma, page)	do { } while (0)
-
-/*
- * flush_cache_vmap() is used when creating mappings (eg, via vmap,
- * vmalloc, ioremap etc) in kernel space for pages.  On non-VIPT
- * caches, since the direct-mappings of these pages may contain cached
- * data, we need to do a full cache flush to ensure that writebacks
- * don't corrupt data placed into these pages via the new mappings.
- */
-static inline void flush_cache_vmap(unsigned long start, unsigned long end)
-{
-}
-
-static inline void flush_cache_vunmap(unsigned long start, unsigned long end)
-{
-}
-
-#endif
diff --git a/arch/unicore32/include/asm/checksum.h b/arch/unicore32/include/asm/checksum.h
deleted file mode 100644
index e774ca268c15..000000000000
--- a/arch/unicore32/include/asm/checksum.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/checksum.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * IP checksum routines
- */
-#ifndef __UNICORE_CHECKSUM_H__
-#define __UNICORE_CHECKSUM_H__
-
-/*
- * computes the checksum of the TCP/UDP pseudo-header
- * returns a 16-bit checksum, already complemented
- */
-
-static inline __wsum
-csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len,
-		   __u8 proto, __wsum sum)
-{
-	__asm__(
-	"add.a	%0, %1, %2\n"
-	"addc.a	%0, %0, %3\n"
-	"addc.a	%0, %0, %4 << #8\n"
-	"addc.a	%0, %0, %5\n"
-	"addc	%0, %0, #0\n"
-	: "=&r"(sum)
-	: "r" (sum), "r" (daddr), "r" (saddr), "r" (len), "Ir" (htons(proto))
-	: "cc");
-	return sum;
-}
-#define csum_tcpudp_nofold	csum_tcpudp_nofold
-
-#include <asm-generic/checksum.h>
-
-#endif
diff --git a/arch/unicore32/include/asm/cmpxchg.h b/arch/unicore32/include/asm/cmpxchg.h
deleted file mode 100644
index 87f960a2e4f0..000000000000
--- a/arch/unicore32/include/asm/cmpxchg.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Atomics xchg/cmpxchg for PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2012 GUAN Xue-tao
- */
-#ifndef __UNICORE_CMPXCHG_H__
-#define __UNICORE_CMPXCHG_H__
-
-/*
- * Generate a link failure on undefined symbol if the pointer points to a value
- * of unsupported size.
- */
-extern void __xchg_bad_pointer(void);
-
-static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
-		int size)
-{
-	unsigned long ret;
-
-	switch (size) {
-	case 1:
-		asm volatile("swapb	%0, %1, [%2]"
-			: "=&r" (ret)
-			: "r" (x), "r" (ptr)
-			: "memory", "cc");
-		break;
-	case 4:
-		asm volatile("swapw	%0, %1, [%2]"
-			: "=&r" (ret)
-			: "r" (x), "r" (ptr)
-			: "memory", "cc");
-		break;
-	default:
-		__xchg_bad_pointer();
-	}
-
-	return ret;
-}
-
-#define xchg(ptr, x) \
-	((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr))))
-
-#include <asm-generic/cmpxchg-local.h>
-
-/*
- * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make
- * them available.
- */
-#define cmpxchg_local(ptr, o, n)					\
-		((__typeof__(*(ptr)))__cmpxchg_local_generic((ptr),	\
-		(unsigned long)(o), (unsigned long)(n), sizeof(*(ptr))))
-#define cmpxchg64_local(ptr, o, n)					\
-		__cmpxchg64_local_generic((ptr), (o), (n))
-
-#include <asm-generic/cmpxchg.h>
-
-#endif /* __UNICORE_CMPXCHG_H__ */
diff --git a/arch/unicore32/include/asm/cpu-single.h b/arch/unicore32/include/asm/cpu-single.h
deleted file mode 100644
index 1b419d697fd1..000000000000
--- a/arch/unicore32/include/asm/cpu-single.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/cpu-single.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_CPU_SINGLE_H__
-#define __UNICORE_CPU_SINGLE_H__
-
-#include <asm/page.h>
-#include <asm/memory.h>
-
-#ifdef __KERNEL__
-#ifndef __ASSEMBLY__
-
-#define cpu_switch_mm(pgd, mm) cpu_do_switch_mm(virt_to_phys(pgd), mm)
-
-#define cpu_get_pgd()					\
-	({						\
-		unsigned long pg;			\
-		__asm__("movc	%0, p0.c2, #0"		\
-			 : "=r" (pg) : : "cc");		\
-		pg &= ~0x0fff;				\
-		(pgd_t *)phys_to_virt(pg);		\
-	})
-
-struct mm_struct;
-
-/* declare all the functions as extern */
-extern void cpu_proc_fin(void);
-extern int cpu_do_idle(void);
-extern void cpu_dcache_clean_area(void *, int);
-extern void cpu_do_switch_mm(unsigned long pgd_phys, struct mm_struct *mm);
-extern void cpu_set_pte(pte_t *ptep, pte_t pte);
-extern void cpu_reset(unsigned long addr) __attribute__((noreturn));
-
-#endif /* __ASSEMBLY__ */
-#endif /* __KERNEL__ */
-
-#endif /* __UNICORE_CPU_SINGLE_H__ */
diff --git a/arch/unicore32/include/asm/cputype.h b/arch/unicore32/include/asm/cputype.h
deleted file mode 100644
index 08a47e3bdbcc..000000000000
--- a/arch/unicore32/include/asm/cputype.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/cputype.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_CPUTYPE_H__
-#define __UNICORE_CPUTYPE_H__
-
-#include <linux/stringify.h>
-
-#define CPUID_CPUID	0
-#define CPUID_CACHETYPE	1
-
-#define read_cpuid(reg)							\
-	({								\
-		unsigned int __val;					\
-		asm("movc	%0, p0.c0, #" __stringify(reg)		\
-		    : "=r" (__val)					\
-		    :							\
-		    : "cc");						\
-		__val;							\
-	})
-
-#define uc32_cpuid		read_cpuid(CPUID_CPUID)
-#define uc32_cachetype		read_cpuid(CPUID_CACHETYPE)
-
-#endif
diff --git a/arch/unicore32/include/asm/delay.h b/arch/unicore32/include/asm/delay.h
deleted file mode 100644
index 934193edfa66..000000000000
--- a/arch/unicore32/include/asm/delay.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/delay.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * Delay routines, using a pre-computed "loops_per_second" value.
- */
-#ifndef __UNICORE_DELAY_H__
-#define __UNICORE_DELAY_H__
-
-#include <asm/param.h>	/* HZ */
-
-extern void __delay(int loops);
-
-/*
- * This function intentionally does not exist; if you see references to
- * it, it means that you're calling udelay() with an out of range value.
- *
- * With currently imposed limits, this means that we support a max delay
- * of 2000us. Further limits: HZ<=1000 and bogomips<=3355
- */
-extern void __bad_udelay(void);
-
-/*
- * division by multiplication: you don't have to worry about
- * loss of precision.
- *
- * Use only for very small delays ( < 1 msec).  Should probably use a
- * lookup table, really, as the multiplications take much too long with
- * short delays.  This is a "reasonable" implementation, though (and the
- * first constant multiplications gets optimized away if the delay is
- * a constant)
- */
-extern void __udelay(unsigned long usecs);
-extern void __const_udelay(unsigned long);
-
-#define MAX_UDELAY_MS 2
-
-#define udelay(n)							\
-	(__builtin_constant_p(n) ?					\
-	  ((n) > (MAX_UDELAY_MS * 1000) ? __bad_udelay() :		\
-			__const_udelay((n) * ((2199023U*HZ)>>11))) :	\
-	  __udelay(n))
-
-#endif /* __UNICORE_DELAY_H__ */
-
diff --git a/arch/unicore32/include/asm/dma.h b/arch/unicore32/include/asm/dma.h
deleted file mode 100644
index 1326310b21e6..000000000000
--- a/arch/unicore32/include/asm/dma.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/dma.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_DMA_H__
-#define __UNICORE_DMA_H__
-
-#include <asm/memory.h>
-#include <asm-generic/dma.h>
-
-#ifdef CONFIG_PCI
-extern int isa_dma_bridge_buggy;
-#endif
-
-#endif /* __UNICORE_DMA_H__ */
diff --git a/arch/unicore32/include/asm/elf.h b/arch/unicore32/include/asm/elf.h
deleted file mode 100644
index a464ed5f05d4..000000000000
--- a/arch/unicore32/include/asm/elf.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/elf.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_ELF_H__
-#define __UNICORE_ELF_H__
-
-#include <asm/hwcap.h>
-
-/*
- * ELF register definitions..
- */
-#include <asm/ptrace.h>
-#include <linux/elf-em.h>
-
-typedef unsigned long elf_greg_t;
-typedef unsigned long elf_freg_t[3];
-
-#define ELF_NGREG (sizeof(struct pt_regs) / sizeof(elf_greg_t))
-typedef elf_greg_t elf_gregset_t[ELF_NGREG];
-
-typedef struct fp_state elf_fpregset_t;
-
-#define R_UNICORE_NONE		0
-#define R_UNICORE_PC24		1
-#define R_UNICORE_ABS32		2
-#define R_UNICORE_CALL		28
-#define R_UNICORE_JUMP24	29
-
-/*
- * These are used to set parameters in the core dumps.
- */
-#define ELF_CLASS	ELFCLASS32
-#define ELF_DATA	ELFDATA2LSB
-#define ELF_ARCH	EM_UNICORE
-
-/*
- * This yields a string that ld.so will use to load implementation
- * specific libraries for optimization.  This is more specific in
- * intent than poking at uname or /proc/cpuinfo.
- *
- */
-#define ELF_PLATFORM_SIZE 8
-#define ELF_PLATFORM	(elf_platform)
-
-extern char elf_platform[];
-
-struct elf32_hdr;
-
-/*
- * This is used to ensure we don't load something for the wrong architecture.
- */
-extern int elf_check_arch(const struct elf32_hdr *);
-#define elf_check_arch elf_check_arch
-
-struct task_struct;
-int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
-#define ELF_CORE_COPY_TASK_REGS dump_task_regs
-
-#define ELF_EXEC_PAGESIZE	4096
-
-/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
-   use of this is to invoke "./ld.so someprog" to test out a new version of
-   the loader.  We need to make sure that it is out of the way of the program
-   that it will "exec", and that there is sufficient room for the brk.  */
-
-#define ELF_ET_DYN_BASE	(2 * TASK_SIZE / 3)
-
-/* When the program starts, a1 contains a pointer to a function to be
-   registered with atexit, as per the SVR4 ABI.  A value of 0 means we
-   have no such handler.  */
-#define ELF_PLAT_INIT(_r, load_addr)	{(_r)->UCreg_00 = 0; }
-
-extern void elf_set_personality(const struct elf32_hdr *);
-#define SET_PERSONALITY(ex)	elf_set_personality(&(ex))
-
-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
-extern int vectors_user_mapping(void);
-#define arch_setup_additional_pages(bprm, uses_interp) vectors_user_mapping()
-#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
-
-#endif
diff --git a/arch/unicore32/include/asm/fpstate.h b/arch/unicore32/include/asm/fpstate.h
deleted file mode 100644
index 5811293e7a7e..000000000000
--- a/arch/unicore32/include/asm/fpstate.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/fpstate.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_FPSTATE_H__
-#define __UNICORE_FPSTATE_H__
-
-#ifndef __ASSEMBLY__
-
-#define FP_REGS_NUMBER		33
-
-struct fp_state {
-	unsigned int regs[FP_REGS_NUMBER];
-} __attribute__((aligned(8)));
-
-#endif
-
-#endif
diff --git a/arch/unicore32/include/asm/fpu-ucf64.h b/arch/unicore32/include/asm/fpu-ucf64.h
deleted file mode 100644
index 7a0c8a9e05d4..000000000000
--- a/arch/unicore32/include/asm/fpu-ucf64.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/fpu-ucf64.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-#define FPSCR			s31
-
-/* FPSCR bits */
-#define FPSCR_DEFAULT_NAN	(1<<25)
-
-#define FPSCR_CMPINSTR_BIT	(1<<31)
-
-#define FPSCR_CON		(1<<29)
-#define FPSCR_TRAP		(1<<27)
-
-/* RND mode */
-#define FPSCR_ROUND_NEAREST	(0<<0)
-#define FPSCR_ROUND_PLUSINF	(2<<0)
-#define FPSCR_ROUND_MINUSINF	(3<<0)
-#define FPSCR_ROUND_TOZERO	(1<<0)
-#define FPSCR_RMODE_BIT		(0)
-#define FPSCR_RMODE_MASK	(7 << FPSCR_RMODE_BIT)
-
-/* trap enable */
-#define FPSCR_IOE		(1<<16)
-#define FPSCR_OFE		(1<<14)
-#define FPSCR_UFE		(1<<13)
-#define FPSCR_IXE		(1<<12)
-#define FPSCR_HIE		(1<<11)
-#define FPSCR_NDE		(1<<10)	/* non denomal */
-
-/* flags */
-#define FPSCR_IDC		(1<<24)
-#define FPSCR_HIC		(1<<23)
-#define FPSCR_IXC		(1<<22)
-#define FPSCR_OFC		(1<<21)
-#define FPSCR_UFC		(1<<20)
-#define FPSCR_IOC		(1<<19)
-
-/* stick bits */
-#define FPSCR_IOS		(1<<9)
-#define FPSCR_OFS		(1<<7)
-#define FPSCR_UFS		(1<<6)
-#define FPSCR_IXS		(1<<5)
-#define FPSCR_HIS		(1<<4)
-#define FPSCR_NDS		(1<<3)	/*non denomal */
diff --git a/arch/unicore32/include/asm/gpio.h b/arch/unicore32/include/asm/gpio.h
deleted file mode 100644
index dfad04ca0a65..000000000000
--- a/arch/unicore32/include/asm/gpio.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/gpio.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_GPIO_H__
-#define __UNICORE_GPIO_H__
-
-#include <linux/io.h>
-#include <asm/irq.h>
-#include <mach/hardware.h>
-#include <asm-generic/gpio.h>
-
-#define GPI_OTP_INT             0
-#define GPI_PCI_INTA            1
-#define GPI_PCI_INTB            2
-#define GPI_PCI_INTC            3
-#define GPI_PCI_INTD            4
-#define GPI_BAT_DET             5
-#define GPI_SD_CD               6
-#define GPI_SOFF_REQ            7
-#define GPI_SD_WP               8
-#define GPI_LCD_CASE_OFF        9
-#define GPO_WIFI_EN             10
-#define GPO_HDD_LED             11
-#define GPO_VGA_EN              12
-#define GPO_LCD_EN              13
-#define GPO_LED_DATA            14
-#define GPO_LED_CLK             15
-#define GPO_CAM_PWR_EN          16
-#define GPO_LCD_VCC_EN          17
-#define GPO_SOFT_OFF            18
-#define GPO_BT_EN               19
-#define GPO_FAN_ON              20
-#define GPO_SPKR                21
-#define GPO_SET_V1              23
-#define GPO_SET_V2              24
-#define GPO_CPU_HEALTH          25
-#define GPO_LAN_SEL             26
-
-#ifdef CONFIG_PUV3_NB0916
-#define GPI_BTN_TOUCH		14
-#define GPIO_IN			0x000043ff /* 1 for input */
-#define GPIO_OUT		0x0fffbc00 /* 1 for output */
-#endif	/* CONFIG_PUV3_NB0916 */
-
-#ifdef CONFIG_PUV3_SMW0919
-#define GPIO_IN			0x000003ff /* 1 for input */
-#define GPIO_OUT		0x0ffffc00 /* 1 for output */
-#endif  /* CONFIG_PUV3_SMW0919 */
-
-#ifdef CONFIG_PUV3_DB0913
-#define GPIO_IN			0x000001df /* 1 for input */
-#define GPIO_OUT		0x03fee800 /* 1 for output */
-#endif  /* CONFIG_PUV3_DB0913 */
-
-#define GPIO_DIR                (~((GPIO_IN) | 0xf0000000))
-				/* 0 input, 1 output */
-
-static inline int gpio_get_value(unsigned gpio)
-{
-	if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
-		return readl(GPIO_GPLR) & GPIO_GPIO(gpio);
-	else
-		return __gpio_get_value(gpio);
-}
-
-static inline void gpio_set_value(unsigned gpio, int value)
-{
-	if (__builtin_constant_p(gpio) && (gpio <= GPIO_MAX))
-		if (value)
-			writel(GPIO_GPIO(gpio), GPIO_GPSR);
-		else
-			writel(GPIO_GPIO(gpio), GPIO_GPCR);
-	else
-		__gpio_set_value(gpio, value);
-}
-
-#define gpio_cansleep	__gpio_cansleep
-
-static inline unsigned gpio_to_irq(unsigned gpio)
-{
-	if ((gpio < IRQ_GPIOHIGH) && (FIELD(1, 1, gpio) & readl(GPIO_GPIR)))
-		return IRQ_GPIOLOW0 + gpio;
-	else
-		return IRQ_GPIO0 + gpio;
-}
-
-static inline unsigned irq_to_gpio(unsigned irq)
-{
-	if (irq < IRQ_GPIOHIGH)
-		return irq - IRQ_GPIOLOW0;
-	else
-		return irq - IRQ_GPIO0;
-}
-
-#endif /* __UNICORE_GPIO_H__ */
diff --git a/arch/unicore32/include/asm/hwcap.h b/arch/unicore32/include/asm/hwcap.h
deleted file mode 100644
index 2e15ffbe8391..000000000000
--- a/arch/unicore32/include/asm/hwcap.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/hwcap.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_HWCAP_H__
-#define __UNICORE_HWCAP_H__
-
-/*
- * HWCAP flags
- */
-#define HWCAP_MSP		1
-#define HWCAP_UNICORE16		2
-#define HWCAP_CMOV		4
-#define HWCAP_UNICORE_F64       8
-#define HWCAP_TLS		0x80
-
-#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
-/*
- * This yields a mask that user programs can use to figure out what
- * instruction set this cpu supports.
- */
-#define ELF_HWCAP		(HWCAP_CMOV | HWCAP_UNICORE_F64)
-#endif
-
-#endif
diff --git a/arch/unicore32/include/asm/hwdef-copro.h b/arch/unicore32/include/asm/hwdef-copro.h
deleted file mode 100644
index 2db8cf864e43..000000000000
--- a/arch/unicore32/include/asm/hwdef-copro.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Co-processor register definitions for PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2012 GUAN Xue-tao
- */
-#ifndef __UNICORE_HWDEF_COPRO_H__
-#define __UNICORE_HWDEF_COPRO_H__
-
-/*
- * Control Register bits (CP#0 CR1)
- */
-#define CR_M	(1 << 0)	/* MMU enable				*/
-#define CR_A	(1 << 1)	/* Alignment abort enable		*/
-#define CR_D	(1 << 2)	/* Dcache enable			*/
-#define CR_I	(1 << 3)	/* Icache enable			*/
-#define CR_B	(1 << 4)	/* Dcache write mechanism: write back	*/
-#define CR_T	(1 << 5)	/* Burst enable				*/
-#define CR_V	(1 << 13)	/* Vectors relocated to 0xffff0000	*/
-
-#ifndef __ASSEMBLY__
-
-#define vectors_high()		(cr_alignment & CR_V)
-
-extern unsigned long cr_no_alignment;	/* defined in entry.S */
-extern unsigned long cr_alignment;	/* defined in entry.S */
-
-static inline unsigned int get_cr(void)
-{
-	unsigned int val;
-	asm("movc %0, p0.c1, #0" : "=r" (val) : : "cc");
-	return val;
-}
-
-static inline void set_cr(unsigned int val)
-{
-	asm volatile("movc p0.c1, %0, #0" : : "r" (val) : "cc");
-	isb();
-}
-
-extern void adjust_cr(unsigned long mask, unsigned long set);
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* __UNICORE_HWDEF_COPRO_H__ */
diff --git a/arch/unicore32/include/asm/io.h b/arch/unicore32/include/asm/io.h
deleted file mode 100644
index bd4e7c332f85..000000000000
--- a/arch/unicore32/include/asm/io.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/io.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_IO_H__
-#define __UNICORE_IO_H__
-
-#ifdef __KERNEL__
-
-#include <asm/byteorder.h>
-#include <asm/memory.h>
-
-#define PCI_IOBASE	PKUNITY_PCILIO_BASE
-#include <asm-generic/io.h>
-
-/*
- * __uc32_ioremap takes CPU physical address.
- */
-extern void __iomem *__uc32_ioremap(unsigned long, size_t);
-extern void __uc32_iounmap(volatile void __iomem *addr);
-
-/*
- * ioremap and friends.
- *
- * ioremap takes a PCI memory address, as specified in
- * Documentation/driver-api/io-mapping.rst.
- *
- */
-#define ioremap(cookie, size)		__uc32_ioremap(cookie, size)
-#define iounmap(cookie)			__uc32_iounmap(cookie)
-
-#define readb_relaxed readb
-#define readw_relaxed readw
-#define readl_relaxed readl
-
-#define HAVE_ARCH_PIO_SIZE
-#define PIO_OFFSET		(unsigned int)(PCI_IOBASE)
-#define PIO_MASK		(unsigned int)(IO_SPACE_LIMIT)
-#define PIO_RESERVED		(PIO_OFFSET + PIO_MASK + 1)
-
-#ifdef CONFIG_STRICT_DEVMEM
-
-#include <linux/ioport.h>
-#include <linux/mm.h>
-
-/*
- * devmem_is_allowed() checks to see if /dev/mem access to a certain
- * address is valid. The argument is a physical page number.
- * We mimic x86 here by disallowing access to system RAM as well as
- * device-exclusive MMIO regions. This effectively disable read()/write()
- * on /dev/mem.
- */
-static inline int devmem_is_allowed(unsigned long pfn)
-{
-	if (iomem_is_exclusive(pfn << PAGE_SHIFT))
-		return 0;
-	if (!page_is_ram(pfn))
-		return 1;
-	return 0;
-}
-
-#endif /* CONFIG_STRICT_DEVMEM */
-
-#endif	/* __KERNEL__ */
-#endif	/* __UNICORE_IO_H__ */
diff --git a/arch/unicore32/include/asm/irq.h b/arch/unicore32/include/asm/irq.h
deleted file mode 100644
index 3f7f07c0338c..000000000000
--- a/arch/unicore32/include/asm/irq.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/irq.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_IRQ_H__
-#define __UNICORE_IRQ_H__
-
-#include <asm-generic/irq.h>
-
-#define	IRQ_GPIOLOW0		0x00
-#define	IRQ_GPIOLOW1		0x01
-#define	IRQ_GPIOLOW2		0x02
-#define	IRQ_GPIOLOW3		0x03
-#define	IRQ_GPIOLOW4		0x04
-#define	IRQ_GPIOLOW5		0x05
-#define	IRQ_GPIOLOW6		0x06
-#define	IRQ_GPIOLOW7		0x07
-#define IRQ_GPIOHIGH		0x08
-#define IRQ_USB			0x09
-#define IRQ_SDC			0x0a
-#define IRQ_AC97		0x0b
-#define IRQ_SATA		0x0c
-#define IRQ_MME			0x0d
-#define IRQ_PCI_BRIDGE		0x0e
-#define	IRQ_DDR			0x0f
-#define	IRQ_SPI			0x10
-#define	IRQ_UNIGFX		0x11
-#define	IRQ_I2C			0x11
-#define	IRQ_UART1		0x12
-#define	IRQ_UART0		0x13
-#define IRQ_UMAL		0x14
-#define IRQ_NAND		0x15
-#define IRQ_PS2_KBD		0x16
-#define IRQ_PS2_AUX		0x17
-#define IRQ_DMA			0x18
-#define IRQ_DMAERR		0x19
-#define	IRQ_TIMER0		0x1a
-#define	IRQ_TIMER1		0x1b
-#define	IRQ_TIMER2		0x1c
-#define	IRQ_TIMER3		0x1d
-#define	IRQ_RTC			0x1e
-#define	IRQ_RTCAlarm		0x1f
-
-#define	IRQ_GPIO0		0x20
-#define	IRQ_GPIO1		0x21
-#define	IRQ_GPIO2		0x22
-#define	IRQ_GPIO3		0x23
-#define	IRQ_GPIO4		0x24
-#define	IRQ_GPIO5		0x25
-#define	IRQ_GPIO6		0x26
-#define	IRQ_GPIO7		0x27
-#define IRQ_GPIO8		0x28
-#define IRQ_GPIO9		0x29
-#define IRQ_GPIO10		0x2a
-#define IRQ_GPIO11		0x2b
-#define IRQ_GPIO12		0x2c
-#define IRQ_GPIO13		0x2d
-#define IRQ_GPIO14		0x2e
-#define IRQ_GPIO15		0x2f
-#define IRQ_GPIO16		0x30
-#define IRQ_GPIO17		0x31
-#define IRQ_GPIO18		0x32
-#define IRQ_GPIO19		0x33
-#define IRQ_GPIO20		0x34
-#define IRQ_GPIO21		0x35
-#define IRQ_GPIO22		0x36
-#define IRQ_GPIO23		0x37
-#define IRQ_GPIO24		0x38
-#define IRQ_GPIO25		0x39
-#define IRQ_GPIO26		0x3a
-#define IRQ_GPIO27		0x3b
-
-#ifdef CONFIG_ARCH_FPGA
-#define IRQ_PCIINTA             IRQ_GPIOLOW2
-#define IRQ_PCIINTB             IRQ_GPIOLOW1
-#define IRQ_PCIINTC             IRQ_GPIOLOW0
-#define IRQ_PCIINTD             IRQ_GPIOLOW6
-#endif
-
-#if defined(CONFIG_PUV3_DB0913) || defined(CONFIG_PUV3_NB0916)	\
-	|| defined(CONFIG_PUV3_SMW0919)
-#define IRQ_PCIINTA             IRQ_GPIOLOW1
-#define IRQ_PCIINTB             IRQ_GPIOLOW2
-#define IRQ_PCIINTC             IRQ_GPIOLOW3
-#define IRQ_PCIINTD             IRQ_GPIOLOW4
-#endif
-
-#define IRQ_SD_CD               IRQ_GPIO6 /* falling or rising trigger */
-
-#ifndef __ASSEMBLY__
-struct pt_regs;
-
-extern void asm_do_IRQ(unsigned int, struct pt_regs *);
-
-#endif
-
-#endif
-
diff --git a/arch/unicore32/include/asm/irqflags.h b/arch/unicore32/include/asm/irqflags.h
deleted file mode 100644
index f64c82e3eae6..000000000000
--- a/arch/unicore32/include/asm/irqflags.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/irqflags.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_IRQFLAGS_H__
-#define __UNICORE_IRQFLAGS_H__
-
-#ifdef __KERNEL__
-
-#include <asm/ptrace.h>
-
-#define ARCH_IRQ_DISABLED	(PRIV_MODE | PSR_I_BIT)
-#define ARCH_IRQ_ENABLED	(PRIV_MODE)
-
-/*
- * Save the current interrupt enable state.
- */
-static inline unsigned long arch_local_save_flags(void)
-{
-	unsigned long temp;
-
-	asm volatile("mov %0, asr" : "=r" (temp) : : "memory", "cc");
-
-	return temp & PSR_c;
-}
-
-/*
- * restore saved IRQ state
- */
-static inline void arch_local_irq_restore(unsigned long flags)
-{
-	unsigned long temp;
-
-	asm volatile(
-		"mov	%0, asr\n"
-		"mov.a	asr, %1\n"
-		"mov.f	asr, %0"
-		: "=&r" (temp)
-		: "r" (flags)
-		: "memory", "cc");
-}
-
-#include <asm-generic/irqflags.h>
-
-#endif
-#endif
diff --git a/arch/unicore32/include/asm/linkage.h b/arch/unicore32/include/asm/linkage.h
deleted file mode 100644
index 8e341ba7bc4a..000000000000
--- a/arch/unicore32/include/asm/linkage.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/linkage.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_LINKAGE_H__
-#define __UNICORE_LINKAGE_H__
-
-#define __ALIGN .align 0
-#define __ALIGN_STR ".align 0"
-
-#define ENDPROC(name) \
-	.type name, %function; \
-	END(name)
-
-#endif
diff --git a/arch/unicore32/include/asm/memblock.h b/arch/unicore32/include/asm/memblock.h
deleted file mode 100644
index eb56a6ddce83..000000000000
--- a/arch/unicore32/include/asm/memblock.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/memblock.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_MEMBLOCK_H__
-#define __UNICORE_MEMBLOCK_H__
-
-/*
- * Memory map description
- */
-# define NR_BANKS 8
-
-struct membank {
-	unsigned long start;
-	unsigned long size;
-	unsigned int highmem;
-};
-
-struct meminfo {
-	int nr_banks;
-	struct membank bank[NR_BANKS];
-};
-
-extern struct meminfo meminfo;
-
-#define for_each_bank(iter, mi)				\
-	for (iter = 0; iter < (mi)->nr_banks; iter++)
-
-#define bank_pfn_start(bank)	__phys_to_pfn((bank)->start)
-#define bank_pfn_end(bank)	__phys_to_pfn((bank)->start + (bank)->size)
-#define bank_pfn_size(bank)	((bank)->size >> PAGE_SHIFT)
-#define bank_phys_start(bank)	((bank)->start)
-#define bank_phys_end(bank)	((bank)->start + (bank)->size)
-#define bank_phys_size(bank)	((bank)->size)
-
-extern void uc32_memblock_init(struct meminfo *);
-
-#endif
diff --git a/arch/unicore32/include/asm/memory.h b/arch/unicore32/include/asm/memory.h
deleted file mode 100644
index 66285178dd9b..000000000000
--- a/arch/unicore32/include/asm/memory.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/memory.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  Note: this file should not be included by non-asm/.h files
- */
-#ifndef __UNICORE_MEMORY_H__
-#define __UNICORE_MEMORY_H__
-
-#include <linux/compiler.h>
-#include <linux/const.h>
-#include <linux/sizes.h>
-#include <mach/memory.h>
-
-/*
- * PAGE_OFFSET - the virtual address of the start of the kernel image
- * TASK_SIZE - the maximum size of a user space task.
- * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area
- */
-#define PAGE_OFFSET		UL(0xC0000000)
-#define TASK_SIZE		(PAGE_OFFSET - UL(0x41000000))
-#define TASK_UNMAPPED_BASE	(PAGE_OFFSET / 3)
-
-/*
- * The module space lives between the addresses given by TASK_SIZE
- * and PAGE_OFFSET - it must be within 32MB of the kernel text.
- */
-#define MODULES_VADDR		(PAGE_OFFSET - 16*1024*1024)
-#if TASK_SIZE > MODULES_VADDR
-#error Top of user space clashes with start of module space
-#endif
-
-#define MODULES_END		(PAGE_OFFSET)
-
-/*
- * Allow 16MB-aligned ioremap pages
- */
-#define IOREMAP_MAX_ORDER	24
-
-/*
- * Physical vs virtual RAM address space conversion.  These are
- * private definitions which should NOT be used outside memory.h
- * files.  Use virt_to_phys/phys_to_virt/__pa/__va instead.
- */
-#ifndef __virt_to_phys
-#define __virt_to_phys(x)	((x) - PAGE_OFFSET + PHYS_OFFSET)
-#define __phys_to_virt(x)	((x) - PHYS_OFFSET + PAGE_OFFSET)
-#endif
-
-/*
- * Convert a page to/from a physical address
- */
-#define page_to_phys(page)	(__pfn_to_phys(page_to_pfn(page)))
-#define phys_to_page(phys)	(pfn_to_page(__phys_to_pfn(phys)))
-
-#ifndef __ASSEMBLY__
-
-#ifndef arch_adjust_zones
-#define arch_adjust_zones(max_zone_pfn) do { } while (0)
-#endif
-
-/*
- * PFNs are used to describe any physical page; this means
- * PFN 0 == physical address 0.
- *
- * This is the PFN of the first RAM page in the kernel
- * direct-mapped view.  We assume this is the first page
- * of RAM in the mem_map as well.
- */
-#define PHYS_PFN_OFFSET	(PHYS_OFFSET >> PAGE_SHIFT)
-
-/*
- * Drivers should NOT use these either.
- */
-#define __pa(x)			__virt_to_phys((unsigned long)(x))
-#define __va(x)			((void *)__phys_to_virt((unsigned long)(x)))
-#define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)
-
-/*
- * Conversion between a struct page and a physical address.
- *
- *  page_to_pfn(page)	convert a struct page * to a PFN number
- *  pfn_to_page(pfn)	convert a _valid_ PFN number to struct page *
- *
- *  virt_to_page(k)	convert a _valid_ virtual address to struct page *
- *  virt_addr_valid(k)	indicates whether a virtual address is valid
- */
-#define ARCH_PFN_OFFSET		PHYS_PFN_OFFSET
-
-#define virt_to_page(kaddr)	pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
-#define virt_addr_valid(kaddr)	((unsigned long)(kaddr) >= PAGE_OFFSET && \
-		(unsigned long)(kaddr) < (unsigned long)high_memory)
-
-#endif
-
-#include <asm-generic/memory_model.h>
-
-#endif
diff --git a/arch/unicore32/include/asm/mmu.h b/arch/unicore32/include/asm/mmu.h
deleted file mode 100644
index 8ad4e7eae17b..000000000000
--- a/arch/unicore32/include/asm/mmu.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/mmu.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_MMU_H__
-#define __UNICORE_MMU_H__
-
-typedef	unsigned long mm_context_t;
-
-#endif
diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h
deleted file mode 100644
index 388c0c811c68..000000000000
--- a/arch/unicore32/include/asm/mmu_context.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/mmu_context.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_MMU_CONTEXT_H__
-#define __UNICORE_MMU_CONTEXT_H__
-
-#include <linux/compiler.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmacache.h>
-#include <linux/io.h>
-
-#include <asm/cacheflush.h>
-#include <asm/cpu-single.h>
-
-#define init_new_context(tsk, mm)	0
-
-#define destroy_context(mm)		do { } while (0)
-
-/*
- * This is called when "tsk" is about to enter lazy TLB mode.
- *
- * mm:  describes the currently active mm context
- * tsk: task which is entering lazy tlb
- * cpu: cpu number which is entering lazy tlb
- *
- * tsk->mm will be NULL
- */
-static inline void
-enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
-{
-}
-
-/*
- * This is the actual mm switch as far as the scheduler
- * is concerned.  No registers are touched.  We avoid
- * calling the CPU specific function when the mm hasn't
- * actually changed.
- */
-static inline void
-switch_mm(struct mm_struct *prev, struct mm_struct *next,
-	  struct task_struct *tsk)
-{
-	unsigned int cpu = smp_processor_id();
-
-	if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next)
-		cpu_switch_mm(next->pgd, next);
-}
-
-#define deactivate_mm(tsk, mm)	do { } while (0)
-#define activate_mm(prev, next)	switch_mm(prev, next, NULL)
-
-/*
- * We are inserting a "fake" vma for the user-accessible vector page so
- * gdb and friends can get to it through ptrace and /proc/<pid>/mem.
- * But we also want to remove it before the generic code gets to see it
- * during process exit or the unmapping of it would  cause total havoc.
- * (the macro is used as remove_vma() is static to mm/mmap.c)
- */
-#define arch_exit_mmap(mm) \
-do { \
-	struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \
-	if (high_vma) { \
-		BUG_ON(high_vma->vm_next);  /* it should be last */ \
-		if (high_vma->vm_prev) \
-			high_vma->vm_prev->vm_next = NULL; \
-		else \
-			mm->mmap = NULL; \
-		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		vmacache_invalidate(mm); \
-		mm->map_count--; \
-		remove_vma(high_vma); \
-	} \
-} while (0)
-
-static inline int arch_dup_mmap(struct mm_struct *oldmm,
-				struct mm_struct *mm)
-{
-	return 0;
-}
-
-static inline void arch_unmap(struct mm_struct *mm,
-			unsigned long start, unsigned long end)
-{
-}
-
-static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
-		bool write, bool execute, bool foreign)
-{
-	/* by default, allow everything */
-	return true;
-}
-#endif
diff --git a/arch/unicore32/include/asm/page.h b/arch/unicore32/include/asm/page.h
deleted file mode 100644
index 96d6bdf180bd..000000000000
--- a/arch/unicore32/include/asm/page.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/page.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PAGE_H__
-#define __UNICORE_PAGE_H__
-
-/* PAGE_SHIFT determines the page size */
-#define PAGE_SHIFT		12
-#define PAGE_SIZE		(_AC(1, UL) << PAGE_SHIFT)
-#define PAGE_MASK		(~(PAGE_SIZE-1))
-
-#ifndef __ASSEMBLY__
-
-struct page;
-struct vm_area_struct;
-
-#define clear_page(page)	memset((void *)(page), 0, PAGE_SIZE)
-extern void copy_page(void *to, const void *from);
-
-#define clear_user_page(page, vaddr, pg)	clear_page(page)
-#define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)
-
-#undef STRICT_MM_TYPECHECKS
-
-#ifdef STRICT_MM_TYPECHECKS
-/*
- * These are used to make use of C type-checking..
- */
-typedef struct { unsigned long pte; } pte_t;
-typedef struct { unsigned long pgd; } pgd_t;
-typedef struct { unsigned long pgprot; } pgprot_t;
-
-#define pte_val(x)      ((x).pte)
-#define pgd_val(x)	((x).pgd)
-#define pgprot_val(x)   ((x).pgprot)
-
-#define __pte(x)        ((pte_t) { (x) })
-#define __pgd(x)	((pgd_t) { (x) })
-#define __pgprot(x)     ((pgprot_t) { (x) })
-
-#else
-/*
- * .. while these make it easier on the compiler
- */
-typedef unsigned long pte_t;
-typedef unsigned long pgd_t;
-typedef unsigned long pgprot_t;
-
-#define pte_val(x)      (x)
-#define pgd_val(x)      (x)
-#define pgprot_val(x)   (x)
-
-#define __pte(x)        (x)
-#define __pgd(x)	(x)
-#define __pgprot(x)     (x)
-
-#endif /* STRICT_MM_TYPECHECKS */
-
-typedef struct page *pgtable_t;
-
-extern int pfn_valid(unsigned long);
-
-#include <asm/memory.h>
-
-#endif /* !__ASSEMBLY__ */
-
-#include <asm-generic/getorder.h>
-
-#endif
diff --git a/arch/unicore32/include/asm/pci.h b/arch/unicore32/include/asm/pci.h
deleted file mode 100644
index 3efa8ee1afce..000000000000
--- a/arch/unicore32/include/asm/pci.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/pci.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PCI_H__
-#define __UNICORE_PCI_H__
-
-#ifdef __KERNEL__
-#include <asm-generic/pci.h>
-#include <mach/hardware.h> /* for PCIBIOS_MIN_* */
-
-#define HAVE_PCI_MMAP
-#define ARCH_GENERIC_PCI_MMAP_RESOURCE
-
-#endif /* __KERNEL__ */
-#endif
diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h
deleted file mode 100644
index ba1c9a79993b..000000000000
--- a/arch/unicore32/include/asm/pgalloc.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/pgalloc.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PGALLOC_H__
-#define __UNICORE_PGALLOC_H__
-
-#include <asm/pgtable-hwdef.h>
-#include <asm/processor.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-#define __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL
-#define __HAVE_ARCH_PTE_ALLOC_ONE
-#include <asm-generic/pgalloc.h>
-
-#define _PAGE_USER_TABLE	(PMD_TYPE_TABLE | PMD_PRESENT)
-#define _PAGE_KERNEL_TABLE	(PMD_TYPE_TABLE | PMD_PRESENT)
-
-extern pgd_t *get_pgd_slow(struct mm_struct *mm);
-extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd);
-
-#define pgd_alloc(mm)			get_pgd_slow(mm)
-#define pgd_free(mm, pgd)		free_pgd_slow(mm, pgd)
-
-/*
- * Allocate one PTE table.
- */
-static inline pte_t *
-pte_alloc_one_kernel(struct mm_struct *mm)
-{
-	pte_t *pte = __pte_alloc_one_kernel(mm);
-
-	if (pte)
-		clean_dcache_area(pte, PTRS_PER_PTE * sizeof(pte_t));
-
-	return pte;
-}
-
-static inline pgtable_t
-pte_alloc_one(struct mm_struct *mm)
-{
-	struct page *pte;
-
-	pte = __pte_alloc_one(mm, GFP_PGTABLE_USER);
-	if (!pte)
-		return NULL;
-	if (!PageHighMem(pte))
-		clean_pte_table(page_address(pte));
-	return pte;
-}
-
-static inline void __pmd_populate(pmd_t *pmdp, unsigned long pmdval)
-{
-	set_pmd(pmdp, __pmd(pmdval));
-	flush_pmd_entry(pmdp);
-}
-
-/*
- * Populate the pmdp entry with a pointer to the pte.  This pmd is part
- * of the mm address space.
- */
-static inline void
-pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
-{
-	unsigned long pte_ptr = (unsigned long)ptep;
-
-	/*
-	 * The pmd must be loaded with the physical
-	 * address of the PTE table
-	 */
-	__pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE);
-}
-
-static inline void
-pmd_populate(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep)
-{
-	__pmd_populate(pmdp,
-			page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE);
-}
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
-#endif
diff --git a/arch/unicore32/include/asm/pgtable-hwdef.h b/arch/unicore32/include/asm/pgtable-hwdef.h
deleted file mode 100644
index f28b58c61db9..000000000000
--- a/arch/unicore32/include/asm/pgtable-hwdef.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/pgtable-hwdef.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PGTABLE_HWDEF_H__
-#define __UNICORE_PGTABLE_HWDEF_H__
-
-/*
- * Hardware page table definitions.
- *
- * + Level 1 descriptor (PMD)
- *   - common
- */
-#define PMD_TYPE_MASK		(3 << 0)
-#define PMD_TYPE_TABLE		(0 << 0)
-/*#define PMD_TYPE_LARGE	(1 << 0) */
-#define PMD_TYPE_INVALID	(2 << 0)
-#define PMD_TYPE_SECT		(3 << 0)
-
-#define PMD_PRESENT		(1 << 2)
-#define PMD_YOUNG		(1 << 3)
-
-/*#define PMD_SECT_DIRTY	(1 << 4) */
-#define PMD_SECT_CACHEABLE	(1 << 5)
-#define PMD_SECT_EXEC		(1 << 6)
-#define PMD_SECT_WRITE		(1 << 7)
-#define PMD_SECT_READ		(1 << 8)
-
-/*
- * + Level 2 descriptor (PTE)
- *   - common
- */
-#define PTE_TYPE_MASK		(3 << 0)
-#define PTE_TYPE_SMALL		(0 << 0)
-#define PTE_TYPE_MIDDLE		(1 << 0)
-#define PTE_TYPE_LARGE		(2 << 0)
-#define PTE_TYPE_INVALID	(3 << 0)
-
-#define PTE_PRESENT		(1 << 2)
-#define PTE_YOUNG		(1 << 3)
-#define PTE_DIRTY		(1 << 4)
-#define PTE_CACHEABLE		(1 << 5)
-#define PTE_EXEC		(1 << 6)
-#define PTE_WRITE		(1 << 7)
-#define PTE_READ		(1 << 8)
-
-#endif
diff --git a/arch/unicore32/include/asm/pgtable.h b/arch/unicore32/include/asm/pgtable.h
deleted file mode 100644
index 97f564c8ecba..000000000000
--- a/arch/unicore32/include/asm/pgtable.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/pgtable.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PGTABLE_H__
-#define __UNICORE_PGTABLE_H__
-
-#include <asm-generic/pgtable-nopmd.h>
-#include <asm/cpu-single.h>
-
-#include <asm/memory.h>
-#include <asm/pgtable-hwdef.h>
-
-/*
- * Just any arbitrary offset to the start of the vmalloc VM area: the
- * current 8MB value just means that there will be a 8MB "hole" after the
- * physical memory until the kernel virtual memory starts.  That means that
- * any out-of-bounds memory accesses will hopefully be caught.
- * The vmalloc() routines leaves a hole of 4kB between each vmalloced
- * area for the same reason. ;)
- *
- * Note that platforms may override VMALLOC_START, but they must provide
- * VMALLOC_END.  VMALLOC_END defines the (exclusive) limit of this space,
- * which may not overlap IO space.
- */
-#ifndef VMALLOC_START
-#define VMALLOC_OFFSET		SZ_8M
-#define VMALLOC_START		(((unsigned long)high_memory + VMALLOC_OFFSET) \
-					& ~(VMALLOC_OFFSET-1))
-#define VMALLOC_END		(0xff000000UL)
-#endif
-
-#define PTRS_PER_PTE		1024
-#define PTRS_PER_PGD		1024
-
-/*
- * PGDIR_SHIFT determines what a third-level page table entry can map
- */
-#define PGDIR_SHIFT		22
-
-#ifndef __ASSEMBLY__
-extern void __pte_error(const char *file, int line, unsigned long val);
-extern void __pgd_error(const char *file, int line, unsigned long val);
-
-#define pte_ERROR(pte)		__pte_error(__FILE__, __LINE__, pte_val(pte))
-#define pgd_ERROR(pgd)		__pgd_error(__FILE__, __LINE__, pgd_val(pgd))
-#endif /* !__ASSEMBLY__ */
-
-#define PGDIR_SIZE		(1UL << PGDIR_SHIFT)
-#define PGDIR_MASK		(~(PGDIR_SIZE-1))
-
-/*
- * This is the lowest virtual address we can permit any user space
- * mapping to be mapped at.  This is particularly important for
- * non-high vector CPUs.
- */
-#define FIRST_USER_ADDRESS	PAGE_SIZE
-
-#define FIRST_USER_PGD_NR	1
-#define USER_PTRS_PER_PGD	((TASK_SIZE/PGDIR_SIZE) - FIRST_USER_PGD_NR)
-
-/*
- * section address mask and size definitions.
- */
-#define SECTION_SHIFT		22
-#define SECTION_SIZE		(1UL << SECTION_SHIFT)
-#define SECTION_MASK		(~(SECTION_SIZE-1))
-
-#ifndef __ASSEMBLY__
-
-/*
- * The pgprot_* and protection_map entries will be fixed up in runtime
- * to include the cachable bits based on memory policy, as well as any
- * architecture dependent bits.
- */
-#define _PTE_DEFAULT		(PTE_PRESENT | PTE_YOUNG | PTE_CACHEABLE)
-
-extern pgprot_t pgprot_user;
-extern pgprot_t pgprot_kernel;
-
-#define PAGE_NONE		pgprot_user
-#define PAGE_SHARED		__pgprot(pgprot_val(pgprot_user | PTE_READ \
-								| PTE_WRITE))
-#define PAGE_SHARED_EXEC	__pgprot(pgprot_val(pgprot_user | PTE_READ \
-								| PTE_WRITE \
-								| PTE_EXEC))
-#define PAGE_COPY		__pgprot(pgprot_val(pgprot_user | PTE_READ)
-#define PAGE_COPY_EXEC		__pgprot(pgprot_val(pgprot_user | PTE_READ \
-								| PTE_EXEC))
-#define PAGE_READONLY		__pgprot(pgprot_val(pgprot_user | PTE_READ))
-#define PAGE_READONLY_EXEC	__pgprot(pgprot_val(pgprot_user | PTE_READ \
-								| PTE_EXEC))
-#define PAGE_KERNEL		pgprot_kernel
-#define PAGE_KERNEL_EXEC	__pgprot(pgprot_val(pgprot_kernel | PTE_EXEC))
-
-#define __PAGE_NONE		__pgprot(_PTE_DEFAULT)
-#define __PAGE_SHARED		__pgprot(_PTE_DEFAULT | PTE_READ \
-							| PTE_WRITE)
-#define __PAGE_SHARED_EXEC	__pgprot(_PTE_DEFAULT | PTE_READ \
-							| PTE_WRITE \
-							| PTE_EXEC)
-#define __PAGE_COPY		__pgprot(_PTE_DEFAULT | PTE_READ)
-#define __PAGE_COPY_EXEC	__pgprot(_PTE_DEFAULT | PTE_READ \
-							| PTE_EXEC)
-#define __PAGE_READONLY		__pgprot(_PTE_DEFAULT | PTE_READ)
-#define __PAGE_READONLY_EXEC	__pgprot(_PTE_DEFAULT | PTE_READ \
-							| PTE_EXEC)
-
-#endif /* __ASSEMBLY__ */
-
-/*
- * The table below defines the page protection levels that we insert into our
- * Linux page table version.  These get translated into the best that the
- * architecture can perform.  Note that on UniCore hardware:
- *  1) We cannot do execute protection
- *  2) If we could do execute protection, then read is implied
- *  3) write implies read permissions
- */
-#define __P000  __PAGE_NONE
-#define __P001  __PAGE_READONLY
-#define __P010  __PAGE_COPY
-#define __P011  __PAGE_COPY
-#define __P100  __PAGE_READONLY_EXEC
-#define __P101  __PAGE_READONLY_EXEC
-#define __P110  __PAGE_COPY_EXEC
-#define __P111  __PAGE_COPY_EXEC
-
-#define __S000  __PAGE_NONE
-#define __S001  __PAGE_READONLY
-#define __S010  __PAGE_SHARED
-#define __S011  __PAGE_SHARED
-#define __S100  __PAGE_READONLY_EXEC
-#define __S101  __PAGE_READONLY_EXEC
-#define __S110  __PAGE_SHARED_EXEC
-#define __S111  __PAGE_SHARED_EXEC
-
-#ifndef __ASSEMBLY__
-/*
- * ZERO_PAGE is a global shared page that is always zero: used
- * for zero-mapped memory areas etc..
- */
-extern struct page *empty_zero_page;
-#define ZERO_PAGE(vaddr)		(empty_zero_page)
-
-#define pte_pfn(pte)			(pte_val(pte) >> PAGE_SHIFT)
-#define pfn_pte(pfn, prot)		(__pte(((pfn) << PAGE_SHIFT) \
-						| pgprot_val(prot)))
-
-#define pte_none(pte)			(!pte_val(pte))
-#define pte_clear(mm, addr, ptep)	set_pte(ptep, __pte(0))
-#define pte_page(pte)			(pfn_to_page(pte_pfn(pte)))
-
-#define set_pte(ptep, pte)	cpu_set_pte(ptep, pte)
-
-#define set_pte_at(mm, addr, ptep, pteval)	\
-	do {					\
-		set_pte(ptep, pteval);          \
-	} while (0)
-
-/*
- * The following only work if pte_present() is true.
- * Undefined behaviour if not..
- */
-#define pte_present(pte)	(pte_val(pte) & PTE_PRESENT)
-#define pte_write(pte)		(pte_val(pte) & PTE_WRITE)
-#define pte_dirty(pte)		(pte_val(pte) & PTE_DIRTY)
-#define pte_young(pte)		(pte_val(pte) & PTE_YOUNG)
-#define pte_exec(pte)		(pte_val(pte) & PTE_EXEC)
-
-#define PTE_BIT_FUNC(fn, op) \
-static inline pte_t pte_##fn(pte_t pte) { pte_val(pte) op; return pte; }
-
-PTE_BIT_FUNC(wrprotect, &= ~PTE_WRITE);
-PTE_BIT_FUNC(mkwrite,   |= PTE_WRITE);
-PTE_BIT_FUNC(mkclean,   &= ~PTE_DIRTY);
-PTE_BIT_FUNC(mkdirty,   |= PTE_DIRTY);
-PTE_BIT_FUNC(mkold,     &= ~PTE_YOUNG);
-PTE_BIT_FUNC(mkyoung,   |= PTE_YOUNG);
-
-/*
- * Mark the prot value as uncacheable.
- */
-#define pgprot_noncached(prot)		\
-	__pgprot(pgprot_val(prot) & ~PTE_CACHEABLE)
-#define pgprot_writecombine(prot)	\
-	__pgprot(pgprot_val(prot) & ~PTE_CACHEABLE)
-
-#define pmd_none(pmd)		(!pmd_val(pmd))
-#define pmd_present(pmd)	(pmd_val(pmd) & PMD_PRESENT)
-#define pmd_bad(pmd)		(((pmd_val(pmd) &		\
-				(PMD_PRESENT | PMD_TYPE_MASK))	\
-				!= (PMD_PRESENT | PMD_TYPE_TABLE)))
-
-#define set_pmd(pmdpd, pmdval)		\
-	do {				\
-		*(pmdpd) = pmdval;	\
-	} while (0)
-
-#define pmd_clear(pmdp)			\
-	do {				\
-		set_pmd(pmdp, __pmd(0));\
-		clean_pmd_entry(pmdp);	\
-	} while (0)
-
-#define pmd_page_vaddr(pmd) ((pte_t *)__va(pmd_val(pmd) & PAGE_MASK))
-#define pmd_page(pmd)		pfn_to_page(__phys_to_pfn(pmd_val(pmd)))
-
-/*
- * Conversion functions: convert a page and protection to a page entry,
- * and a page entry and page directory to the page they refer to.
- */
-#define mk_pte(page, prot)	pfn_pte(page_to_pfn(page), prot)
-
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
-{
-	const unsigned long mask = PTE_EXEC | PTE_WRITE | PTE_READ;
-	pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask);
-	return pte;
-}
-
-extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
-
-/*
- * Encode and decode a swap entry.  Swap entries are stored in the Linux
- * page tables as follows:
- *
- *   3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
- *   1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
- *   <--------------- offset --------------> <--- type --> 0 0 0 0 0
- *
- * This gives us up to 127 swap files and 32GB per swap file.  Note that
- * the offset field is always non-zero.
- */
-#define __SWP_TYPE_SHIFT	5
-#define __SWP_TYPE_BITS		7
-#define __SWP_TYPE_MASK		((1 << __SWP_TYPE_BITS) - 1)
-#define __SWP_OFFSET_SHIFT	(__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)
-
-#define __swp_type(x)		(((x).val >> __SWP_TYPE_SHIFT)		\
-				& __SWP_TYPE_MASK)
-#define __swp_offset(x)		((x).val >> __SWP_OFFSET_SHIFT)
-#define __swp_entry(type, offset) ((swp_entry_t) {			\
-				((type) << __SWP_TYPE_SHIFT) |		\
-				((offset) << __SWP_OFFSET_SHIFT) })
-
-#define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
-#define __swp_entry_to_pte(swp)	((pte_t) { (swp).val })
-
-/*
- * It is an error for the kernel to have more swap files than we can
- * encode in the PTEs.  This ensures that we know when MAX_SWAPFILES
- * is increased beyond what we presently support.
- */
-#define MAX_SWAPFILES_CHECK()	\
-	BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > __SWP_TYPE_BITS)
-
-/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
-/* FIXME: this is not correct */
-#define kern_addr_valid(addr)	(1)
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* __UNICORE_PGTABLE_H__ */
diff --git a/arch/unicore32/include/asm/processor.h b/arch/unicore32/include/asm/processor.h
deleted file mode 100644
index 6f01620da3d1..000000000000
--- a/arch/unicore32/include/asm/processor.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/processor.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_PROCESSOR_H__
-#define __UNICORE_PROCESSOR_H__
-
-#ifdef __KERNEL__
-
-#include <asm/ptrace.h>
-#include <asm/types.h>
-
-#ifdef __KERNEL__
-#define STACK_TOP	TASK_SIZE
-#define STACK_TOP_MAX	TASK_SIZE
-#endif
-
-struct debug_entry {
-	u32			address;
-	u32			insn;
-};
-
-struct debug_info {
-	int			nsaved;
-	struct debug_entry	bp[2];
-};
-
-struct thread_struct {
-							/* fault info	  */
-	unsigned long		address;
-	unsigned long		trap_no;
-	unsigned long		error_code;
-							/* debugging	  */
-	struct debug_info	debug;
-};
-
-#define INIT_THREAD  {	}
-
-#define start_thread(regs, pc, sp)					\
-({									\
-	unsigned long *stack = (unsigned long *)sp;			\
-	memset(regs->uregs, 0, sizeof(regs->uregs));			\
-	regs->UCreg_asr = USER_MODE;					\
-	regs->UCreg_pc = pc & ~1;	/* pc */                        \
-	regs->UCreg_sp = sp;		/* sp */                        \
-	regs->UCreg_02 = stack[2];	/* r2 (envp) */                 \
-	regs->UCreg_01 = stack[1];	/* r1 (argv) */                 \
-	regs->UCreg_00 = stack[0];	/* r0 (argc) */                 \
-})
-
-/* Forward declaration, a strange C thing */
-struct task_struct;
-
-/* Free all resources held by a thread. */
-extern void release_thread(struct task_struct *);
-
-unsigned long get_wchan(struct task_struct *p);
-
-#define cpu_relax()			barrier()
-
-#define task_pt_regs(p) \
-	((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1)
-
-#define KSTK_EIP(tsk)	(task_pt_regs(tsk)->UCreg_pc)
-#define KSTK_ESP(tsk)	(task_pt_regs(tsk)->UCreg_sp)
-
-#endif
-
-#endif /* __UNICORE_PROCESSOR_H__ */
diff --git a/arch/unicore32/include/asm/ptrace.h b/arch/unicore32/include/asm/ptrace.h
deleted file mode 100644
index bb4cbc42c321..000000000000
--- a/arch/unicore32/include/asm/ptrace.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/ptrace.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_PTRACE_H__
-#define __UNICORE_PTRACE_H__
-
-#include <uapi/asm/ptrace.h>
-
-#ifndef __ASSEMBLY__
-
-#define user_mode(regs)	\
-	(processor_mode(regs) == USER_MODE)
-
-#define processor_mode(regs) \
-	((regs)->UCreg_asr & MODE_MASK)
-
-#define interrupts_enabled(regs) \
-	(!((regs)->UCreg_asr & PSR_I_BIT))
-
-#define fast_interrupts_enabled(regs) \
-	(!((regs)->UCreg_asr & PSR_R_BIT))
-
-/* Are the current registers suitable for user mode?
- * (used to maintain security in signal handlers)
- */
-static inline int valid_user_regs(struct pt_regs *regs)
-{
-	unsigned long mode = regs->UCreg_asr & MODE_MASK;
-
-	/*
-	 * Always clear the R (REAL) bits
-	 */
-	regs->UCreg_asr &= ~(PSR_R_BIT);
-
-	if ((regs->UCreg_asr & PSR_I_BIT) == 0) {
-		if (mode == USER_MODE)
-			return 1;
-	}
-
-	/*
-	 * Force ASR to something logical...
-	 */
-	regs->UCreg_asr &= PSR_f | USER_MODE;
-
-	return 0;
-}
-
-#define instruction_pointer(regs)	((regs)->UCreg_pc)
-#define user_stack_pointer(regs)	((regs)->UCreg_sp)
-#define profile_pc(regs)		instruction_pointer(regs)
-
-#endif /* __ASSEMBLY__ */
-#endif
diff --git a/arch/unicore32/include/asm/stacktrace.h b/arch/unicore32/include/asm/stacktrace.h
deleted file mode 100644
index 3e59f9d2faed..000000000000
--- a/arch/unicore32/include/asm/stacktrace.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/stacktrace.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_STACKTRACE_H__
-#define __UNICORE_STACKTRACE_H__
-
-struct stackframe {
-	unsigned long fp;
-	unsigned long sp;
-	unsigned long lr;
-	unsigned long pc;
-};
-
-#ifdef CONFIG_FRAME_POINTER
-extern int unwind_frame(struct stackframe *frame);
-#else
-#define unwind_frame(f) (-EINVAL)
-#endif
-extern void walk_stackframe(struct stackframe *frame,
-			    int (*fn)(struct stackframe *, void *), void *data);
-
-#endif	/* __UNICORE_STACKTRACE_H__ */
diff --git a/arch/unicore32/include/asm/string.h b/arch/unicore32/include/asm/string.h
deleted file mode 100644
index 1649b0e4271b..000000000000
--- a/arch/unicore32/include/asm/string.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/string.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_STRING_H__
-#define __UNICORE_STRING_H__
-
-/*
- * We don't do inline string functions, since the
- * optimised inline asm versions are not small.
- */
-
-#define __HAVE_ARCH_STRRCHR
-extern char *strrchr(const char *s, int c);
-
-#define __HAVE_ARCH_STRCHR
-extern char *strchr(const char *s, int c);
-
-#define __HAVE_ARCH_MEMCPY
-extern void *memcpy(void *, const void *, __kernel_size_t);
-
-#define __HAVE_ARCH_MEMMOVE
-extern void *memmove(void *, const void *, __kernel_size_t);
-
-#define __HAVE_ARCH_MEMCHR
-extern void *memchr(const void *, int, __kernel_size_t);
-
-#define __HAVE_ARCH_MEMSET
-extern void *memset(void *, int, __kernel_size_t);
-
-#endif
diff --git a/arch/unicore32/include/asm/suspend.h b/arch/unicore32/include/asm/suspend.h
deleted file mode 100644
index 72bd89c44d10..000000000000
--- a/arch/unicore32/include/asm/suspend.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/suspend.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_SUSPEND_H__
-#define __UNICORE_SUSPEND_H__
-
-#ifndef __ASSEMBLY__
-
-#include <asm/ptrace.h>
-
-struct swsusp_arch_regs {
-	struct cpu_context_save	cpu_context;	/* cpu context */
-#ifdef CONFIG_UNICORE_FPU_F64
-	struct fp_state		fpstate __attribute__((aligned(8)));
-#endif
-};
-#endif
-
-#endif /* __UNICORE_SUSPEND_H__ */
-
diff --git a/arch/unicore32/include/asm/switch_to.h b/arch/unicore32/include/asm/switch_to.h
deleted file mode 100644
index 12e534b3bfa5..000000000000
--- a/arch/unicore32/include/asm/switch_to.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Task switching for PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2012 GUAN Xue-tao
- */
-#ifndef __UNICORE_SWITCH_TO_H__
-#define __UNICORE_SWITCH_TO_H__
-
-struct task_struct;
-struct thread_info;
-
-/*
- * switch_to(prev, next) should switch from task `prev' to `next'
- * `prev' will never be the same as `next'.  schedule() itself
- * contains the memory barrier to tell GCC not to cache `current'.
- */
-extern struct task_struct *__switch_to(struct task_struct *,
-		struct thread_info *, struct thread_info *);
-
-#define switch_to(prev, next, last)					\
-	do {								\
-		last = __switch_to(prev, task_thread_info(prev),	\
-					task_thread_info(next));	\
-	} while (0)
-
-#endif /* __UNICORE_SWITCH_TO_H__ */
diff --git a/arch/unicore32/include/asm/syscall.h b/arch/unicore32/include/asm/syscall.h
deleted file mode 100644
index 607961797fff..000000000000
--- a/arch/unicore32/include/asm/syscall.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_UNICORE_SYSCALL_H
-#define _ASM_UNICORE_SYSCALL_H
-
-#include <uapi/linux/audit.h>
-
-static inline int syscall_get_arch(struct task_struct *task)
-{
-	return AUDIT_ARCH_UNICORE;
-}
-
-#endif	/* _ASM_UNICORE_SYSCALL_H */
diff --git a/arch/unicore32/include/asm/thread_info.h b/arch/unicore32/include/asm/thread_info.h
deleted file mode 100644
index d8a6d6b7a403..000000000000
--- a/arch/unicore32/include/asm/thread_info.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/thread_info.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_THREAD_INFO_H__
-#define __UNICORE_THREAD_INFO_H__
-
-#ifdef __KERNEL__
-
-#include <linux/compiler.h>
-#include <asm/fpstate.h>
-
-#define THREAD_SIZE_ORDER	1
-#define THREAD_SIZE		8192
-#define THREAD_START_SP		(THREAD_SIZE - 8)
-
-#ifndef __ASSEMBLY__
-
-struct task_struct;
-
-#include <asm/types.h>
-
-typedef struct {
-	unsigned long seg;
-} mm_segment_t;
-
-struct cpu_context_save {
-	__u32	r4;
-	__u32	r5;
-	__u32	r6;
-	__u32	r7;
-	__u32	r8;
-	__u32	r9;
-	__u32	r10;
-	__u32	r11;
-	__u32	r12;
-	__u32	r13;
-	__u32	r14;
-	__u32	r15;
-	__u32	r16;
-	__u32	r17;
-	__u32	r18;
-	__u32	r19;
-	__u32	r20;
-	__u32	r21;
-	__u32	r22;
-	__u32	r23;
-	__u32	r24;
-	__u32	r25;
-	__u32	r26;
-	__u32	fp;
-	__u32	sp;
-	__u32	pc;
-};
-
-/*
- * low level task data that entry.S needs immediate access to.
- * __switch_to() assumes cpu_context follows immediately after cpu_domain.
- */
-struct thread_info {
-	unsigned long		flags;		/* low level flags */
-	int			preempt_count;	/* 0 => preemptable */
-						/* <0 => bug */
-	mm_segment_t		addr_limit;	/* address limit */
-	struct task_struct	*task;		/* main task structure */
-	__u32			cpu;		/* cpu */
-	struct cpu_context_save	cpu_context;	/* cpu context */
-	__u32			syscall;	/* syscall number */
-	__u8			used_cp[16];	/* thread used copro */
-#ifdef CONFIG_UNICORE_FPU_F64
-	struct fp_state		fpstate __attribute__((aligned(8)));
-#endif
-};
-
-#define INIT_THREAD_INFO(tsk)						\
-{									\
-	.task		= &tsk,						\
-	.flags		= 0,						\
-	.preempt_count	= INIT_PREEMPT_COUNT,				\
-	.addr_limit	= KERNEL_DS,					\
-}
-
-/*
- * how to get the thread information struct from C
- */
-static inline struct thread_info *current_thread_info(void) __attribute_const__;
-
-static inline struct thread_info *current_thread_info(void)
-{
-	register unsigned long sp asm ("sp");
-	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
-}
-
-#define thread_saved_pc(tsk)	\
-	((unsigned long)(task_thread_info(tsk)->cpu_context.pc))
-#define thread_saved_sp(tsk)	\
-	((unsigned long)(task_thread_info(tsk)->cpu_context.sp))
-#define thread_saved_fp(tsk)	\
-	((unsigned long)(task_thread_info(tsk)->cpu_context.fp))
-
-#endif
-
-/*
- * thread information flags:
- *  TIF_SYSCALL_TRACE	- syscall trace active
- *  TIF_SIGPENDING	- signal pending
- *  TIF_NEED_RESCHED	- rescheduling necessary
- *  TIF_NOTIFY_RESUME	- callback before returning to user
- */
-#define TIF_SIGPENDING		0
-#define TIF_NEED_RESCHED	1
-#define TIF_NOTIFY_RESUME	2	/* callback before returning to user */
-#define TIF_SYSCALL_TRACE	8
-#define TIF_MEMDIE		18
-#define TIF_RESTORE_SIGMASK	20
-
-#define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
-#define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
-#define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
-#define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
-
-/*
- * Change these and you break ASM code in entry-common.S
- */
-#define _TIF_WORK_MASK \
-	(_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_NOTIFY_RESUME)
-
-#endif /* __KERNEL__ */
-#endif /* __UNICORE_THREAD_INFO_H__ */
diff --git a/arch/unicore32/include/asm/timex.h b/arch/unicore32/include/asm/timex.h
deleted file mode 100644
index d714af3dbce1..000000000000
--- a/arch/unicore32/include/asm/timex.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/timex.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __UNICORE_TIMEX_H__
-#define __UNICORE_TIMEX_H__
-
-#ifdef	CONFIG_ARCH_FPGA
-
-/* in FPGA, APB clock is 33M, and OST clock is 32K, */
-/* so, 1M is selected for timer interrupt correctly */
-#define	CLOCK_TICK_RATE		(32*1024)
-
-#endif
-
-#if defined(CONFIG_PUV3_DB0913)		\
-	|| defined(CONFIG_PUV3_NB0916)	\
-	|| defined(CONFIG_PUV3_SMW0919)
-
-#define  CLOCK_TICK_RATE         (14318000)
-
-#endif
-
-#include <asm-generic/timex.h>
-
-#endif
diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h
deleted file mode 100644
index 4663d8cc80ef..000000000000
--- a/arch/unicore32/include/asm/tlb.h
+++ /dev/null
@@ -1,24 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/tlb.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_TLB_H__
-#define __UNICORE_TLB_H__
-
-/*
- * unicore32 lacks an efficient flush_tlb_range(), use flush_tlb_mm().
- */
-
-#define __pte_free_tlb(tlb, pte, addr)				\
-	do {							\
-		pgtable_pte_page_dtor(pte);			\
-		tlb_remove_page((tlb), (pte));			\
-	} while (0)
-
-#include <asm-generic/tlb.h>
-
-#endif
diff --git a/arch/unicore32/include/asm/tlbflush.h b/arch/unicore32/include/asm/tlbflush.h
deleted file mode 100644
index 1cf18ef55515..000000000000
--- a/arch/unicore32/include/asm/tlbflush.h
+++ /dev/null
@@ -1,192 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/tlbflush.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_TLBFLUSH_H__
-#define __UNICORE_TLBFLUSH_H__
-
-#ifndef __ASSEMBLY__
-
-#include <linux/sched.h>
-
-extern void __cpu_flush_user_tlb_range(unsigned long, unsigned long,
-					struct vm_area_struct *);
-extern void __cpu_flush_kern_tlb_range(unsigned long, unsigned long);
-
-/*
- *	TLB Management
- *	==============
- *
- *	The arch/unicore/mm/tlb-*.S files implement these methods.
- *
- *	The TLB specific code is expected to perform whatever tests it
- *	needs to determine if it should invalidate the TLB for each
- *	call.  Start addresses are inclusive and end addresses are
- *	exclusive; it is safe to round these addresses down.
- *
- *	flush_tlb_all()
- *
- *		Invalidate the entire TLB.
- *
- *	flush_tlb_mm(mm)
- *
- *		Invalidate all TLB entries in a particular address
- *		space.
- *		- mm	- mm_struct describing address space
- *
- *	flush_tlb_range(mm,start,end)
- *
- *		Invalidate a range of TLB entries in the specified
- *		address space.
- *		- mm	- mm_struct describing address space
- *		- start - start address (may not be aligned)
- *		- end	- end address (exclusive, may not be aligned)
- *
- *	flush_tlb_page(vaddr,vma)
- *
- *		Invalidate the specified page in the specified address range.
- *		- vaddr - virtual address (may not be aligned)
- *		- vma	- vma_struct describing address range
- *
- *	flush_kern_tlb_page(kaddr)
- *
- *		Invalidate the TLB entry for the specified page.  The address
- *		will be in the kernels virtual memory space.  Current uses
- *		only require the D-TLB to be invalidated.
- *		- kaddr - Kernel virtual memory address
- */
-
-static inline void local_flush_tlb_all(void)
-{
-	const int zero = 0;
-
-	/* TLB invalidate all */
-	asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (zero) : "cc");
-}
-
-static inline void local_flush_tlb_mm(struct mm_struct *mm)
-{
-	const int zero = 0;
-
-	if (cpumask_test_cpu(get_cpu(), mm_cpumask(mm))) {
-		/* TLB invalidate all */
-		asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop"
-			: : "r" (zero) : "cc");
-	}
-	put_cpu();
-}
-
-static inline void
-local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
-{
-	if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) {
-#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE
-		/* iTLB invalidate page */
-		asm("movc p0.c6, %0, #5; nop; nop; nop; nop; nop; nop; nop; nop"
-			: : "r" (uaddr & PAGE_MASK) : "cc");
-		/* dTLB invalidate page */
-		asm("movc p0.c6, %0, #3; nop; nop; nop; nop; nop; nop; nop; nop"
-			: : "r" (uaddr & PAGE_MASK) : "cc");
-#else
-		/* TLB invalidate all */
-		asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop"
-			: : "r" (uaddr & PAGE_MASK) : "cc");
-#endif
-	}
-}
-
-static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
-{
-#ifndef CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE
-	/* iTLB invalidate page */
-	asm("movc p0.c6, %0, #5; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (kaddr & PAGE_MASK) : "cc");
-	/* dTLB invalidate page */
-	asm("movc p0.c6, %0, #3; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (kaddr & PAGE_MASK) : "cc");
-#else
-	/* TLB invalidate all */
-	asm("movc p0.c6, %0, #6; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (kaddr & PAGE_MASK) : "cc");
-#endif
-}
-
-/*
- *	flush_pmd_entry
- *
- *	Flush a PMD entry (word aligned, or double-word aligned) to
- *	RAM if the TLB for the CPU we are running on requires this.
- *	This is typically used when we are creating PMD entries.
- *
- *	clean_pmd_entry
- *
- *	Clean (but don't drain the write buffer) if the CPU requires
- *	these operations.  This is typically used when we are removing
- *	PMD entries.
- */
-static inline void flush_pmd_entry(pmd_t *pmd)
-{
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	/* flush dcache line, see dcacheline_flush in proc-macros.S */
-	asm("mov	r1, %0 << #20\n"
-		"ldw	r2, =_stext\n"
-		"add	r2, r2, r1 >> #20\n"
-		"ldw	r1, [r2+], #0x0000\n"
-		"ldw	r1, [r2+], #0x1000\n"
-		"ldw	r1, [r2+], #0x2000\n"
-		"ldw	r1, [r2+], #0x3000\n"
-		: : "r" (pmd) : "r1", "r2");
-#else
-	/* flush dcache all */
-	asm("movc p0.c5, %0, #14; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (pmd) : "cc");
-#endif
-}
-
-static inline void clean_pmd_entry(pmd_t *pmd)
-{
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	/* clean dcache line */
-	asm("movc p0.c5, %0, #11; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (__pa(pmd) & ~(L1_CACHE_BYTES - 1)) : "cc");
-#else
-	/* clean dcache all */
-	asm("movc p0.c5, %0, #10; nop; nop; nop; nop; nop; nop; nop; nop"
-		: : "r" (pmd) : "cc");
-#endif
-}
-
-/*
- * Convert calls to our calling convention.
- */
-#define local_flush_tlb_range(vma, start, end)	\
-	__cpu_flush_user_tlb_range(start, end, vma)
-#define local_flush_tlb_kernel_range(s, e)	\
-	__cpu_flush_kern_tlb_range(s, e)
-
-#define flush_tlb_all		local_flush_tlb_all
-#define flush_tlb_mm		local_flush_tlb_mm
-#define flush_tlb_page		local_flush_tlb_page
-#define flush_tlb_kernel_page	local_flush_tlb_kernel_page
-#define flush_tlb_range		local_flush_tlb_range
-#define flush_tlb_kernel_range	local_flush_tlb_kernel_range
-
-/*
- * if PG_dcache_clean is not set for the page, we need to ensure that any
- * cache entries for the kernels virtual memory range are written
- * back to the page.
- */
-extern void update_mmu_cache(struct vm_area_struct *vma,
-		unsigned long addr, pte_t *ptep);
-
-extern void do_bad_area(unsigned long addr, unsigned int fsr,
-		struct pt_regs *regs);
-
-#endif
-
-#endif
diff --git a/arch/unicore32/include/asm/traps.h b/arch/unicore32/include/asm/traps.h
deleted file mode 100644
index ad1508a9a903..000000000000
--- a/arch/unicore32/include/asm/traps.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/traps.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_TRAP_H__
-#define __UNICORE_TRAP_H__
-
-extern void __init early_trap_init(void);
-extern void dump_backtrace_entry(unsigned long where,
-		unsigned long from, unsigned long frame);
-
-extern void do_DataAbort(unsigned long addr, unsigned int fsr,
-		 struct pt_regs *regs);
-#endif
diff --git a/arch/unicore32/include/asm/uaccess.h b/arch/unicore32/include/asm/uaccess.h
deleted file mode 100644
index 33c24f430511..000000000000
--- a/arch/unicore32/include/asm/uaccess.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/asm/uaccess.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_UACCESS_H__
-#define __UNICORE_UACCESS_H__
-
-#include <asm/memory.h>
-
-#define __strncpy_from_user	__strncpy_from_user
-#define __strnlen_user		__strnlen_user
-#define __clear_user		__clear_user
-
-#define __kernel_ok		(uaccess_kernel())
-#define __user_ok(addr, size)	(((size) <= TASK_SIZE)			\
-				&& ((addr) <= TASK_SIZE - (size)))
-#define __access_ok(addr, size)	(__kernel_ok || __user_ok((addr), (size)))
-
-extern unsigned long __must_check
-raw_copy_from_user(void *to, const void __user *from, unsigned long n);
-extern unsigned long __must_check
-raw_copy_to_user(void __user *to, const void *from, unsigned long n);
-extern unsigned long __must_check
-__clear_user(void __user *addr, unsigned long n);
-extern unsigned long __must_check
-__strncpy_from_user(char *to, const char __user *from, unsigned long count);
-extern unsigned long
-__strnlen_user(const char __user *s, long n);
-#define INLINE_COPY_FROM_USER
-#define INLINE_COPY_TO_USER
-
-#include <asm-generic/uaccess.h>
-
-#endif /* __UNICORE_UACCESS_H__ */
diff --git a/arch/unicore32/include/asm/vmalloc.h b/arch/unicore32/include/asm/vmalloc.h
deleted file mode 100644
index 054435818a14..000000000000
--- a/arch/unicore32/include/asm/vmalloc.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef _ASM_UNICORE32_VMALLOC_H
-#define _ASM_UNICORE32_VMALLOC_H
-
-#endif /* _ASM_UNICORE32_VMALLOC_H */
diff --git a/arch/unicore32/include/mach/PKUnity.h b/arch/unicore32/include/mach/PKUnity.h
deleted file mode 100644
index 78f77517c1c7..000000000000
--- a/arch/unicore32/include/mach/PKUnity.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/PKUnity.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-/* Be sure that virtual mapping is defined right */
-#ifndef __MACH_PUV3_HARDWARE_H__
-#error You must include hardware.h not PKUnity.h
-#endif
-
-#include <mach/bitfield.h>
-
-/*
- * Memory Definitions
- */
-#define PKUNITY_SDRAM_BASE		0x00000000 /* 0x00000000 - 0x7FFFFFFF 2GB */
-#define PKUNITY_MMIO_BASE		0x80000000 /* 0x80000000 - 0xFFFFFFFF 2GB */
-
-/*
- * PKUNITY System Bus Addresses (PCI): 0x80000000 - 0xBFFFFFFF (1GB)
- * 0x80000000 - 0x8000000B 12B    PCI Configuration regs
- * 0x80010000 - 0x80010250 592B   PCI Bridge Base
- * 0x80030000 - 0x8003FFFF 64KB   PCI Legacy IO
- * 0x90000000 - 0x97FFFFFF 128MB  PCI AHB-PCI MEM-mapping
- * 0x98000000 - 0x9FFFFFFF 128MB  PCI PCI-AHB MEM-mapping
- */
-#define PKUNITY_PCI_BASE		io_p2v(0x80000000) /* 0x80000000 - 0xBFFFFFFF 1GB */
-#include <mach/regs-pci.h>
-
-#define PKUNITY_PCICFG_BASE		(PKUNITY_PCI_BASE + 0x0)
-#define PKUNITY_PCIBRI_BASE		(PKUNITY_PCI_BASE + 0x00010000)
-#define PKUNITY_PCILIO_BASE		(PKUNITY_PCI_BASE + 0x00030000)
-#define PKUNITY_PCIMEM_BASE		(PKUNITY_PCI_BASE + 0x10000000)
-#define PKUNITY_PCIAHB_BASE		(PKUNITY_PCI_BASE + 0x18000000)
-
-/*
- * PKUNITY System Bus Addresses (AHB): 0xC0000000 - 0xEDFFFFFF (640MB)
- */
-#define PKUNITY_AHB_BASE		io_p2v(0xC0000000)
-
-/* AHB-0 is DDR2 SDRAM */
-/* AHB-1 is PCI Space */
-#define PKUNITY_ARBITER_BASE		(PKUNITY_AHB_BASE + 0x000000) /* AHB-2 */
-#define PKUNITY_DDR2CTRL_BASE		(PKUNITY_AHB_BASE + 0x100000) /* AHB-3 */
-#define PKUNITY_DMAC_BASE		(PKUNITY_AHB_BASE + 0x200000) /* AHB-4 */
-#include <mach/regs-dmac.h>
-#define PKUNITY_UMAL_BASE		(PKUNITY_AHB_BASE + 0x300000) /* AHB-5 */
-#include <mach/regs-umal.h>
-#define PKUNITY_USB_BASE		(PKUNITY_AHB_BASE + 0x400000) /* AHB-6 */
-#define PKUNITY_SATA_BASE		(PKUNITY_AHB_BASE + 0x500000) /* AHB-7 */
-#define PKUNITY_SMC_BASE		(PKUNITY_AHB_BASE + 0x600000) /* AHB-8 */
-/* AHB-9 is for APB bridge */
-#define PKUNITY_MME_BASE		(PKUNITY_AHB_BASE + 0x700000) /* AHB-10 */
-#define PKUNITY_UNIGFX_BASE		(PKUNITY_AHB_BASE + 0x800000) /* AHB-11 */
-#include <mach/regs-unigfx.h>
-#define PKUNITY_NAND_BASE		(PKUNITY_AHB_BASE + 0x900000) /* AHB-12 */
-#include <mach/regs-nand.h>
-#define PKUNITY_H264D_BASE		(PKUNITY_AHB_BASE + 0xA00000) /* AHB-13 */
-#define PKUNITY_H264E_BASE		(PKUNITY_AHB_BASE + 0xB00000) /* AHB-14 */
-
-/*
- * PKUNITY Peripheral Bus Addresses (APB): 0xEE000000 - 0xEFFFFFFF (128MB)
- */
-#define PKUNITY_APB_BASE		io_p2v(0xEE000000)
-
-#define PKUNITY_UART0_BASE		(PKUNITY_APB_BASE + 0x000000) /* APB-0 */
-#define PKUNITY_UART1_BASE		(PKUNITY_APB_BASE + 0x100000) /* APB-1 */
-#include <mach/regs-uart.h>
-#define PKUNITY_I2C_BASE		(PKUNITY_APB_BASE + 0x200000) /* APB-2 */
-#include <mach/regs-i2c.h>
-#define PKUNITY_SPI_BASE		(PKUNITY_APB_BASE + 0x300000) /* APB-3 */
-#include <mach/regs-spi.h>
-#define PKUNITY_AC97_BASE		(PKUNITY_APB_BASE + 0x400000) /* APB-4 */
-#include <mach/regs-ac97.h>
-#define PKUNITY_GPIO_BASE		(PKUNITY_APB_BASE + 0x500000) /* APB-5 */
-#include <mach/regs-gpio.h>
-#define PKUNITY_INTC_BASE		(PKUNITY_APB_BASE + 0x600000) /* APB-6 */
-#include <mach/regs-intc.h>
-#define PKUNITY_RTC_BASE		(PKUNITY_APB_BASE + 0x700000) /* APB-7 */
-#include <mach/regs-rtc.h>
-#define PKUNITY_OST_BASE		(PKUNITY_APB_BASE + 0x800000) /* APB-8 */
-#include <mach/regs-ost.h>
-#define PKUNITY_RESETC_BASE		(PKUNITY_APB_BASE + 0x900000) /* APB-9 */
-#include <mach/regs-resetc.h>
-#define PKUNITY_PM_BASE			(PKUNITY_APB_BASE + 0xA00000) /* APB-10 */
-#include <mach/regs-pm.h>
-#define PKUNITY_PS2_BASE		(PKUNITY_APB_BASE + 0xB00000) /* APB-11 */
-#include <mach/regs-ps2.h>
-#define PKUNITY_SDC_BASE		(PKUNITY_APB_BASE + 0xC00000) /* APB-12 */
-#include <mach/regs-sdc.h>
-
diff --git a/arch/unicore32/include/mach/bitfield.h b/arch/unicore32/include/mach/bitfield.h
deleted file mode 100644
index 766b7f01f1cd..000000000000
--- a/arch/unicore32/include/mach/bitfield.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/bitfield.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __MACH_PUV3_BITFIELD_H__
-#define __MACH_PUV3_BITFIELD_H__
-
-#ifndef __ASSEMBLY__
-#define UData(Data)	((unsigned long) (Data))
-#else
-#define UData(Data)	(Data)
-#endif
-
-#define FIELD(val, vmask, vshift)	(((val) & ((UData(1) << (vmask)) - 1)) << (vshift))
-#define FMASK(vmask, vshift)		(((UData(1) << (vmask)) - 1) << (vshift))
-
-#endif /* __MACH_PUV3_BITFIELD_H__ */
diff --git a/arch/unicore32/include/mach/dma.h b/arch/unicore32/include/mach/dma.h
deleted file mode 100644
index 271001cd13c4..000000000000
--- a/arch/unicore32/include/mach/dma.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/dma.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __MACH_PUV3_DMA_H__
-#define __MACH_PUV3_DMA_H__
-
-/*
- * The PKUnity has six internal DMA channels.
- */
-#define MAX_DMA_CHANNELS	6
-
-typedef enum {
-	DMA_PRIO_HIGH = 0,
-	DMA_PRIO_MEDIUM = 1,
-	DMA_PRIO_LOW = 2
-} puv3_dma_prio;
-
-/*
- * DMA registration
- */
-
-extern int puv3_request_dma(char *name,
-			 puv3_dma_prio prio,
-			 void (*irq_handler)(int, void *),
-			 void (*err_handler)(int, void *),
-			 void *data);
-
-extern void puv3_free_dma(int dma_ch);
-
-static inline void puv3_stop_dma(int ch)
-{
-	writel(readl(DMAC_CONFIG(ch)) & ~DMAC_CONFIG_EN, DMAC_CONFIG(ch));
-}
-
-static inline void puv3_resume_dma(int ch)
-{
-	writel(readl(DMAC_CONFIG(ch)) | DMAC_CONFIG_EN, DMAC_CONFIG(ch));
-}
-
-#endif /* __MACH_PUV3_DMA_H__ */
diff --git a/arch/unicore32/include/mach/hardware.h b/arch/unicore32/include/mach/hardware.h
deleted file mode 100644
index 2d7571cbd1d0..000000000000
--- a/arch/unicore32/include/mach/hardware.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/hardware.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * This file contains the hardware definitions for PKUnity architecture
- */
-
-#ifndef __MACH_PUV3_HARDWARE_H__
-#define __MACH_PUV3_HARDWARE_H__
-
-#include <mach/PKUnity.h>
-
-#ifndef __ASSEMBLY__
-#define io_p2v(x)	(void __iomem *)((x) - PKUNITY_MMIO_BASE)
-#define io_v2p(x)	(phys_addr_t)((x) + PKUNITY_MMIO_BASE)
-#else
-#define io_p2v(x)	((x) - PKUNITY_MMIO_BASE)
-#define io_v2p(x)	((x) + PKUNITY_MMIO_BASE)
-#endif
-
-#define PCIBIOS_MIN_IO			0x4000 /* should lower than 64KB */
-#define PCIBIOS_MIN_MEM			io_v2p(PKUNITY_PCIMEM_BASE)
-
-#define pcibios_assign_all_busses()	1
-
-#endif  /* __MACH_PUV3_HARDWARE_H__ */
diff --git a/arch/unicore32/include/mach/map.h b/arch/unicore32/include/mach/map.h
deleted file mode 100644
index 7a83eeeb1287..000000000000
--- a/arch/unicore32/include/mach/map.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/map.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  Page table mapping constructs and function prototypes
- */
-#define MT_DEVICE		0
-#define MT_DEVICE_CACHED	2
-#define MT_KUSER		7
-#define MT_HIGH_VECTORS		8
-#define MT_MEMORY		9
-#define MT_ROM			10
-
diff --git a/arch/unicore32/include/mach/memory.h b/arch/unicore32/include/mach/memory.h
deleted file mode 100644
index b4e6035cb9a3..000000000000
--- a/arch/unicore32/include/mach/memory.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/memory.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __MACH_PUV3_MEMORY_H__
-#define __MACH_PUV3_MEMORY_H__
-
-#include <mach/hardware.h>
-
-/* Physical DRAM offset. */
-#define PHYS_OFFSET	UL(0x00000000)
-/* The base address of exception vectors. */
-#define VECTORS_BASE	UL(0xffff0000)
-/* The base address of kuser area. */
-#define KUSER_BASE	UL(0x80000000)
-
-#ifdef __ASSEMBLY__
-/* The byte offset of the kernel image in RAM from the start of RAM. */
-#define KERNEL_IMAGE_START	0x00408000
-#endif
-
-#if !defined(__ASSEMBLY__) && defined(CONFIG_PCI)
-
-void puv3_pci_adjust_zones(unsigned long *max_zone_pfn);
-
-#define arch_adjust_zones(max_zone_pfn) \
-	puv3_pci_adjust_zones(max_zone_pfn)
-
-#endif
-
-/*
- * PCI controller in PKUnity-3 masks highest 5-bit for upstream channel,
- * so we must limit the DMA allocation within 128M physical memory for
- * supporting PCI devices.
- */
-#define PCI_DMA_THRESHOLD	(PHYS_OFFSET + SZ_128M - 1)
-
-#define is_pcibus_device(dev)	(dev &&			\
-				(strncmp(dev->bus->name, "pci", 3) == 0))
-
-#define __virt_to_pcibus(x)     (__virt_to_phys((x) + PKUNITY_PCIAHB_BASE))
-#define __pcibus_to_virt(x)     (__phys_to_virt(x) - PKUNITY_PCIAHB_BASE)
-
-/* kuser area */
-#define KUSER_VECPAGE_BASE	(KUSER_BASE + UL(0x3fff0000))
-/* kuser_vecpage (0xbfff0000) is ro, and vectors page (0xffff0000) is rw */
-#define kuser_vecpage_to_vectors(x)	((x) - (KUSER_VECPAGE_BASE)	\
-					+ (VECTORS_BASE))
-
-#endif
diff --git a/arch/unicore32/include/mach/ocd.h b/arch/unicore32/include/mach/ocd.h
deleted file mode 100644
index 2a814929e389..000000000000
--- a/arch/unicore32/include/mach/ocd.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/ocd.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __MACH_PUV3_OCD_H__
-#define __MACH_PUV3_OCD_H__
-
-#if defined(CONFIG_DEBUG_OCD)
-static inline void ocd_putc(unsigned int c)
-{
-	int status, i = 0x2000000;
-
-	do {
-		if (--i < 0)
-			return;
-
-		asm volatile ("movc %0, p1.c0, #0" : "=r" (status));
-	} while (status & 2);
-
-	asm("movc p1.c1, %0, #1" : : "r" (c));
-}
-
-#define putc(ch)	ocd_putc(ch)
-#else
-#define putc(ch)
-#endif
-
-#endif
diff --git a/arch/unicore32/include/mach/pm.h b/arch/unicore32/include/mach/pm.h
deleted file mode 100644
index cb40b8490a57..000000000000
--- a/arch/unicore32/include/mach/pm.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore/include/mach/pm.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __PUV3_PM_H__
-#define __PUV3_PM_H__
-
-#include <linux/suspend.h>
-
-struct puv3_cpu_pm_fns {
-	int	save_count;
-	void	(*save)(unsigned long *);
-	void	(*restore)(unsigned long *);
-	int	(*valid)(suspend_state_t state);
-	void	(*enter)(suspend_state_t state);
-	int	(*prepare)(void);
-	void	(*finish)(void);
-};
-
-extern struct puv3_cpu_pm_fns *puv3_cpu_pm_fns;
-
-/* sleep.S */
-extern void puv3_cpu_suspend(unsigned int);
-
-extern void puv3_cpu_resume(void);
-
-extern int puv3_pm_enter(suspend_state_t state);
-
-/* Defined in hibernate_asm.S */
-extern int restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist);
-
-extern struct pbe *restore_pblist;
-#endif
diff --git a/arch/unicore32/include/mach/regs-ac97.h b/arch/unicore32/include/mach/regs-ac97.h
deleted file mode 100644
index 85c601898d02..000000000000
--- a/arch/unicore32/include/mach/regs-ac97.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity AC97 Registers
- */
-
-#define PKUNITY_AC97_CONR		(PKUNITY_AC97_BASE + 0x0000)
-#define PKUNITY_AC97_OCR		(PKUNITY_AC97_BASE + 0x0004)
-#define PKUNITY_AC97_ICR		(PKUNITY_AC97_BASE + 0x0008)
-#define PKUNITY_AC97_CRAC		(PKUNITY_AC97_BASE + 0x000C)
-#define PKUNITY_AC97_INTR		(PKUNITY_AC97_BASE + 0x0010)
-#define PKUNITY_AC97_INTRSTAT		(PKUNITY_AC97_BASE + 0x0014)
-#define PKUNITY_AC97_INTRCLEAR		(PKUNITY_AC97_BASE + 0x0018)
-#define PKUNITY_AC97_ENABLE		(PKUNITY_AC97_BASE + 0x001C)
-#define PKUNITY_AC97_OUT_FIFO		(PKUNITY_AC97_BASE + 0x0020)
-#define PKUNITY_AC97_IN_FIFO		(PKUNITY_AC97_BASE + 0x0030)
-
-#define AC97_CODEC_REG(v)               FIELD((v), 7, 16)
-#define AC97_CODEC_VAL(v)               FIELD((v), 16, 0)
-#define AC97_CODEC_WRITECOMPLETE        FIELD(1, 1, 2)
-
-/*
- * VAR PLAY SAMPLE RATE
- */
-#define AC97_CMD_VPSAMPLE		(FIELD(3, 2, 16) | FIELD(3, 2, 0))
-
-/*
- * FIX CAPTURE SAMPLE RATE
- */
-#define AC97_CMD_FCSAMPLE		FIELD(7, 3, 0)
-
-#define AC97_CMD_RESET			FIELD(1, 1, 0)
-#define AC97_CMD_ENABLE			FIELD(1, 1, 0)
-#define AC97_CMD_DISABLE		FIELD(0, 1, 0)
diff --git a/arch/unicore32/include/mach/regs-dmac.h b/arch/unicore32/include/mach/regs-dmac.h
deleted file mode 100644
index bbdc52d06a98..000000000000
--- a/arch/unicore32/include/mach/regs-dmac.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Direct Memory Access Controller (DMAC)
- */
-
-/*
- * Interrupt Status Reg DMAC_ISR.
- */
-#define DMAC_ISR		(PKUNITY_DMAC_BASE + 0x0020)
-/*
- * Interrupt Transfer Complete Status Reg DMAC_ITCSR.
- */
-#define DMAC_ITCSR		(PKUNITY_DMAC_BASE + 0x0050)
-/*
- * Interrupt Transfer Complete Clear Reg DMAC_ITCCR.
- */
-#define DMAC_ITCCR		(PKUNITY_DMAC_BASE + 0x0060)
-/*
- * Interrupt Error Status Reg DMAC_IESR.
- */
-#define DMAC_IESR		(PKUNITY_DMAC_BASE + 0x0080)
-/*
- * Interrupt Error Clear Reg DMAC_IECR.
- */
-#define DMAC_IECR		(PKUNITY_DMAC_BASE + 0x0090)
-/*
- * Enable Channels Reg DMAC_ENCH.
- */
-#define DMAC_ENCH		(PKUNITY_DMAC_BASE + 0x00B0)
-
-/*
- * DMA control reg. Space [byte]
- */
-#define DMASp                   0x00000100
-
-/*
- * Source Addr DMAC_SRCADDR(ch).
- */
-#define DMAC_SRCADDR(ch)	(PKUNITY_DMAC_BASE + (ch)*DMASp + 0x00)
-/*
- * Destination Addr DMAC_DESTADDR(ch).
- */
-#define DMAC_DESTADDR(ch)	(PKUNITY_DMAC_BASE + (ch)*DMASp + 0x04)
-/*
- * Control Reg DMAC_CONTROL(ch).
- */
-#define DMAC_CONTROL(ch)	(PKUNITY_DMAC_BASE + (ch)*DMASp + 0x0C)
-/*
- * Configuration Reg DMAC_CONFIG(ch).
- */
-#define DMAC_CONFIG(ch)		(PKUNITY_DMAC_BASE + (ch)*DMASp + 0x10)
-
-#define DMAC_IR_MASK            FMASK(6, 0)
-/*
- * select channel (ch)
- */
-#define DMAC_CHANNEL(ch)	FIELD(1, 1, (ch))
-
-#define DMAC_CONTROL_SIZE_BYTE(v)       (FIELD((v), 12, 14) | \
-					FIELD(0, 3, 9) | FIELD(0, 3, 6))
-#define DMAC_CONTROL_SIZE_HWORD(v)      (FIELD((v) >> 1, 12, 14) | \
-					FIELD(1, 3, 9) | FIELD(1, 3, 6))
-#define DMAC_CONTROL_SIZE_WORD(v)       (FIELD((v) >> 2, 12, 14) | \
-					FIELD(2, 3, 9) | FIELD(2, 3, 6))
-#define DMAC_CONTROL_DI                 FIELD(1, 1, 13)
-#define DMAC_CONTROL_SI                 FIELD(1, 1, 12)
-#define DMAC_CONTROL_BURST_1BYTE        (FIELD(0, 3, 3) | FIELD(0, 3, 0))
-#define DMAC_CONTROL_BURST_4BYTE        (FIELD(3, 3, 3) | FIELD(3, 3, 0))
-#define DMAC_CONTROL_BURST_8BYTE        (FIELD(5, 3, 3) | FIELD(5, 3, 0))
-#define DMAC_CONTROL_BURST_16BYTE       (FIELD(7, 3, 3) | FIELD(7, 3, 0))
-
-#define	DMAC_CONFIG_UART0_WR    (FIELD(2, 4, 11) | FIELD(1, 2, 1))
-#define	DMAC_CONFIG_UART0_RD    (FIELD(2, 4, 7)  | FIELD(2, 2, 1))
-#define	DMAC_CONFIG_UART1_WR    (FIELD(3, 4, 11) | FIELD(1, 2, 1))
-#define	DMAC_CONFIG_UART1RD     (FIELD(3, 4, 7)  | FIELD(2, 2, 1))
-#define	DMAC_CONFIG_AC97WR      (FIELD(4, 4, 11) | FIELD(1, 2, 1))
-#define	DMAC_CONFIG_AC97RD      (FIELD(4, 4, 7)  | FIELD(2, 2, 1))
-#define	DMAC_CONFIG_MMCWR       (FIELD(7, 4, 11) | FIELD(1, 2, 1))
-#define	DMAC_CONFIG_MMCRD       (FIELD(7, 4, 7)  | FIELD(2, 2, 1))
-#define DMAC_CONFIG_MASKITC     FIELD(1, 1, 4)
-#define DMAC_CONFIG_MASKIE      FIELD(1, 1, 3)
-#define DMAC_CONFIG_EN          FIELD(1, 1, 0)
diff --git a/arch/unicore32/include/mach/regs-gpio.h b/arch/unicore32/include/mach/regs-gpio.h
deleted file mode 100644
index 5fc701ee33e3..000000000000
--- a/arch/unicore32/include/mach/regs-gpio.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity General-Purpose Input/Output (GPIO) Registers
- */
-
-/*
- * Voltage Status Reg GPIO_GPLR.
- */
-#define GPIO_GPLR	(PKUNITY_GPIO_BASE + 0x0000)
-/*
- * Pin Direction Reg GPIO_GPDR.
- */
-#define GPIO_GPDR	(PKUNITY_GPIO_BASE + 0x0004)
-/*
- * Output Pin Set Reg GPIO_GPSR.
- */
-#define GPIO_GPSR	(PKUNITY_GPIO_BASE + 0x0008)
-/*
- * Output Pin Clear Reg GPIO_GPCR.
- */
-#define GPIO_GPCR	(PKUNITY_GPIO_BASE + 0x000C)
-/*
- * Raise Edge Detect Reg GPIO_GRER.
- */
-#define GPIO_GRER	(PKUNITY_GPIO_BASE + 0x0010)
-/*
- * Fall Edge Detect Reg GPIO_GFER.
- */
-#define GPIO_GFER	(PKUNITY_GPIO_BASE + 0x0014)
-/*
- * Edge Status Reg GPIO_GEDR.
- */
-#define GPIO_GEDR	(PKUNITY_GPIO_BASE + 0x0018)
-/*
- * Special Voltage Detect Reg GPIO_GPIR.
- */
-#define GPIO_GPIR	(PKUNITY_GPIO_BASE + 0x0020)
-
-#define GPIO_MIN	(0)
-#define GPIO_MAX	(27)
-
-#define GPIO_GPIO(Nb)	(0x00000001 << (Nb))	/* GPIO [0..27] */
-#define GPIO_GPIO0	GPIO_GPIO(0)	/* GPIO  [0] */
-#define GPIO_GPIO1	GPIO_GPIO(1)	/* GPIO  [1] */
-#define GPIO_GPIO2	GPIO_GPIO(2)	/* GPIO  [2] */
-#define GPIO_GPIO3	GPIO_GPIO(3)	/* GPIO  [3] */
-#define GPIO_GPIO4	GPIO_GPIO(4)	/* GPIO  [4] */
-#define GPIO_GPIO5	GPIO_GPIO(5)	/* GPIO  [5] */
-#define GPIO_GPIO6	GPIO_GPIO(6)	/* GPIO  [6] */
-#define GPIO_GPIO7	GPIO_GPIO(7)	/* GPIO  [7] */
-#define GPIO_GPIO8	GPIO_GPIO(8)	/* GPIO  [8] */
-#define GPIO_GPIO9	GPIO_GPIO(9)	/* GPIO  [9] */
-#define GPIO_GPIO10	GPIO_GPIO(10)	/* GPIO [10] */
-#define GPIO_GPIO11	GPIO_GPIO(11)	/* GPIO [11] */
-#define GPIO_GPIO12	GPIO_GPIO(12)	/* GPIO [12] */
-#define GPIO_GPIO13	GPIO_GPIO(13)	/* GPIO [13] */
-#define GPIO_GPIO14	GPIO_GPIO(14)	/* GPIO [14] */
-#define GPIO_GPIO15	GPIO_GPIO(15)	/* GPIO [15] */
-#define GPIO_GPIO16	GPIO_GPIO(16)	/* GPIO [16] */
-#define GPIO_GPIO17	GPIO_GPIO(17)	/* GPIO [17] */
-#define GPIO_GPIO18	GPIO_GPIO(18)	/* GPIO [18] */
-#define GPIO_GPIO19	GPIO_GPIO(19)	/* GPIO [19] */
-#define GPIO_GPIO20	GPIO_GPIO(20)	/* GPIO [20] */
-#define GPIO_GPIO21	GPIO_GPIO(21)	/* GPIO [21] */
-#define GPIO_GPIO22	GPIO_GPIO(22)	/* GPIO [22] */
-#define GPIO_GPIO23	GPIO_GPIO(23)	/* GPIO [23] */
-#define GPIO_GPIO24	GPIO_GPIO(24)	/* GPIO [24] */
-#define GPIO_GPIO25	GPIO_GPIO(25)	/* GPIO [25] */
-#define GPIO_GPIO26	GPIO_GPIO(26)	/* GPIO [26] */
-#define GPIO_GPIO27	GPIO_GPIO(27)	/* GPIO [27] */
-
diff --git a/arch/unicore32/include/mach/regs-i2c.h b/arch/unicore32/include/mach/regs-i2c.h
deleted file mode 100644
index b41aa7c92430..000000000000
--- a/arch/unicore32/include/mach/regs-i2c.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Inter-integrated Circuit (I2C) Registers
- */
-
-/*
- * Control Reg I2C_CON.
- */
-#define I2C_CON		(PKUNITY_I2C_BASE + 0x0000)
-/*
- * Target Address Reg I2C_TAR.
- */
-#define I2C_TAR		(PKUNITY_I2C_BASE + 0x0004)
-/*
- * Data buffer and command Reg I2C_DATACMD.
- */
-#define I2C_DATACMD	(PKUNITY_I2C_BASE + 0x0010)
-/*
- * Enable Reg I2C_ENABLE.
- */
-#define I2C_ENABLE	(PKUNITY_I2C_BASE + 0x006C)
-/*
- * Status Reg I2C_STATUS.
- */
-#define I2C_STATUS	(PKUNITY_I2C_BASE + 0x0070)
-/*
- * Tx FIFO Length Reg I2C_TXFLR.
- */
-#define I2C_TXFLR	(PKUNITY_I2C_BASE + 0x0074)
-/*
- * Rx FIFO Length Reg I2C_RXFLR.
- */
-#define I2C_RXFLR	(PKUNITY_I2C_BASE + 0x0078)
-/*
- * Enable Status Reg I2C_ENSTATUS.
- */
-#define I2C_ENSTATUS	(PKUNITY_I2C_BASE + 0x009C)
-
-#define I2C_CON_MASTER          FIELD(1, 1, 0)
-#define I2C_CON_SPEED_STD       FIELD(1, 2, 1)
-#define I2C_CON_SPEED_FAST      FIELD(2, 2, 1)
-#define I2C_CON_RESTART         FIELD(1, 1, 5)
-#define I2C_CON_SLAVEDISABLE    FIELD(1, 1, 6)
-
-#define I2C_DATACMD_READ        FIELD(1, 1, 8)
-#define I2C_DATACMD_WRITE       FIELD(0, 1, 8)
-#define I2C_DATACMD_DAT_MASK    FMASK(8, 0)
-#define I2C_DATACMD_DAT(v)      FIELD((v), 8, 0)
-
-#define I2C_ENABLE_ENABLE       FIELD(1, 1, 0)
-#define I2C_ENABLE_DISABLE      FIELD(0, 1, 0)
-
-#define I2C_STATUS_RFF          FIELD(1, 1, 4)
-#define I2C_STATUS_RFNE         FIELD(1, 1, 3)
-#define I2C_STATUS_TFE          FIELD(1, 1, 2)
-#define I2C_STATUS_TFNF         FIELD(1, 1, 1)
-#define I2C_STATUS_ACTIVITY     FIELD(1, 1, 0)
-
-#define I2C_ENSTATUS_ENABLE	FIELD(1, 1, 0)
-
-#define I2C_TAR_THERMAL	0x4f
-#define I2C_TAR_SPD	0x50
-#define I2C_TAR_PWIC    0x55
-#define I2C_TAR_EEPROM	0x57
diff --git a/arch/unicore32/include/mach/regs-intc.h b/arch/unicore32/include/mach/regs-intc.h
deleted file mode 100644
index 4eb1b5b571bb..000000000000
--- a/arch/unicore32/include/mach/regs-intc.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUNITY Interrupt Controller (INTC) Registers
- */
-/*
- * INTC Level Reg INTC_ICLR.
- */
-#define INTC_ICLR	(PKUNITY_INTC_BASE + 0x0000)
-/*
- * INTC Mask Reg INTC_ICMR.
- */
-#define INTC_ICMR	(PKUNITY_INTC_BASE + 0x0004)
-/*
- * INTC Pending Reg INTC_ICPR.
- */
-#define INTC_ICPR	(PKUNITY_INTC_BASE + 0x0008)
-/*
- * INTC IRQ Pending Reg INTC_ICIP.
- */
-#define INTC_ICIP	(PKUNITY_INTC_BASE + 0x000C)
-/*
- * INTC REAL Pending Reg INTC_ICFP.
- */
-#define INTC_ICFP	(PKUNITY_INTC_BASE + 0x0010)
-/*
- * INTC Control Reg INTC_ICCR.
- */
-#define INTC_ICCR	(PKUNITY_INTC_BASE + 0x0014)
-
diff --git a/arch/unicore32/include/mach/regs-nand.h b/arch/unicore32/include/mach/regs-nand.h
deleted file mode 100644
index 7f29939251ef..000000000000
--- a/arch/unicore32/include/mach/regs-nand.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity NAND Controller Registers
- */
-/*
- * ID Reg. 0 NAND_IDR0
- */
-#define NAND_IDR0	(PKUNITY_NAND_BASE + 0x0000)
-/*
- * ID Reg. 1 NAND_IDR1
- */
-#define NAND_IDR1	(PKUNITY_NAND_BASE + 0x0004)
-/*
- * ID Reg. 2 NAND_IDR2
- */
-#define NAND_IDR2	(PKUNITY_NAND_BASE + 0x0008)
-/*
- * ID Reg. 3 NAND_IDR3
- */
-#define NAND_IDR3	(PKUNITY_NAND_BASE + 0x000C)
-/*
- * Page Address Reg 0 NAND_PAR0
- */
-#define NAND_PAR0	(PKUNITY_NAND_BASE + 0x0010)
-/*
- * Page Address Reg 1 NAND_PAR1
- */
-#define NAND_PAR1	(PKUNITY_NAND_BASE + 0x0014)
-/*
- * Page Address Reg 2 NAND_PAR2
- */
-#define NAND_PAR2	(PKUNITY_NAND_BASE + 0x0018)
-/*
- * ECC Enable Reg NAND_ECCEN
- */
-#define NAND_ECCEN	(PKUNITY_NAND_BASE + 0x001C)
-/*
- * Buffer Reg NAND_BUF
- */
-#define NAND_BUF	(PKUNITY_NAND_BASE + 0x0020)
-/*
- * ECC Status Reg NAND_ECCSR
- */
-#define NAND_ECCSR	(PKUNITY_NAND_BASE + 0x0024)
-/*
- * Command Reg NAND_CMD
- */
-#define NAND_CMD	(PKUNITY_NAND_BASE + 0x0028)
-/*
- * DMA Configure Reg NAND_DMACR
- */
-#define NAND_DMACR	(PKUNITY_NAND_BASE + 0x002C)
-/*
- * Interrupt Reg NAND_IR
- */
-#define NAND_IR		(PKUNITY_NAND_BASE + 0x0030)
-/*
- * Interrupt Mask Reg NAND_IMR
- */
-#define NAND_IMR	(PKUNITY_NAND_BASE + 0x0034)
-/*
- * Chip Enable Reg NAND_CHIPEN
- */
-#define NAND_CHIPEN	(PKUNITY_NAND_BASE + 0x0038)
-/*
- * Address Reg NAND_ADDR
- */
-#define NAND_ADDR	(PKUNITY_NAND_BASE + 0x003C)
-
-/*
- * Command bits NAND_CMD_CMD_MASK
- */
-#define NAND_CMD_CMD_MASK		FMASK(4, 4)
-#define NAND_CMD_CMD_READPAGE		FIELD(0x0, 4, 4)
-#define NAND_CMD_CMD_ERASEBLOCK		FIELD(0x6, 4, 4)
-#define NAND_CMD_CMD_READSTATUS		FIELD(0x7, 4, 4)
-#define NAND_CMD_CMD_WRITEPAGE		FIELD(0x8, 4, 4)
-#define NAND_CMD_CMD_READID		FIELD(0x9, 4, 4)
-#define NAND_CMD_CMD_RESET		FIELD(0xf, 4, 4)
-
diff --git a/arch/unicore32/include/mach/regs-ost.h b/arch/unicore32/include/mach/regs-ost.h
deleted file mode 100644
index 6c63e7b7569e..000000000000
--- a/arch/unicore32/include/mach/regs-ost.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Operating System Timer (OST) Registers
- */
-/*
- * Match Reg 0 OST_OSMR0
- */
-#define OST_OSMR0	(PKUNITY_OST_BASE + 0x0000)
-/*
- * Match Reg 1 OST_OSMR1
- */
-#define OST_OSMR1	(PKUNITY_OST_BASE + 0x0004)
-/*
- * Match Reg 2 OST_OSMR2
- */
-#define OST_OSMR2	(PKUNITY_OST_BASE + 0x0008)
-/*
- * Match Reg 3 OST_OSMR3
- */
-#define OST_OSMR3	(PKUNITY_OST_BASE + 0x000C)
-/*
- * Counter Reg OST_OSCR
- */
-#define OST_OSCR	(PKUNITY_OST_BASE + 0x0010)
-/*
- * Status Reg OST_OSSR
- */
-#define OST_OSSR	(PKUNITY_OST_BASE + 0x0014)
-/*
- * Watchdog Enable Reg OST_OWER
- */
-#define OST_OWER	(PKUNITY_OST_BASE + 0x0018)
-/*
- * Interrupt Enable Reg OST_OIER
- */
-#define OST_OIER	(PKUNITY_OST_BASE + 0x001C)
-
-/*
- * PWM Registers: IO base address: PKUNITY_OST_BASE + 0x80
- *      PWCR: Pulse Width Control Reg
- *      DCCR: Duty Cycle Control Reg
- *      PCR: Period Control Reg
- */
-#define OST_PWM_PWCR	(0x00)
-#define OST_PWM_DCCR	(0x04)
-#define OST_PWM_PCR 	(0x08)
-
-/*
- * Match detected 0 OST_OSSR_M0
- */
-#define OST_OSSR_M0		FIELD(1, 1, 0)
-/*
- * Match detected 1 OST_OSSR_M1
- */
-#define OST_OSSR_M1		FIELD(1, 1, 1)
-/*
- * Match detected 2 OST_OSSR_M2
- */
-#define OST_OSSR_M2		FIELD(1, 1, 2)
-/*
- * Match detected 3 OST_OSSR_M3
- */
-#define OST_OSSR_M3		FIELD(1, 1, 3)
-
-/*
- * Interrupt enable 0 OST_OIER_E0
- */
-#define OST_OIER_E0		FIELD(1, 1, 0)
-/*
- * Interrupt enable 1 OST_OIER_E1
- */
-#define OST_OIER_E1		FIELD(1, 1, 1)
-/*
- * Interrupt enable 2 OST_OIER_E2
- */
-#define OST_OIER_E2		FIELD(1, 1, 2)
-/*
- * Interrupt enable 3 OST_OIER_E3
- */
-#define OST_OIER_E3		FIELD(1, 1, 3)
-
-/*
- * Watchdog Match Enable OST_OWER_WME
- */
-#define OST_OWER_WME		FIELD(1, 1, 0)
-
-/*
- * PWM Full Duty Cycle OST_PWMDCCR_FDCYCLE
- */
-#define OST_PWMDCCR_FDCYCLE	FIELD(1, 1, 10)
-
diff --git a/arch/unicore32/include/mach/regs-pci.h b/arch/unicore32/include/mach/regs-pci.h
deleted file mode 100644
index 25bb307b87c3..000000000000
--- a/arch/unicore32/include/mach/regs-pci.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity AHB-PCI Bridge Registers
- */
-
-/*
- * AHB/PCI fixed physical address for pci addess configuration
- */
-/*
- * PCICFG Bridge Base Reg.
- */
-#define PCICFG_BRIBASE          (PKUNITY_PCICFG_BASE + 0x0000)
-/*
- * PCICFG Address Reg.
- */
-#define PCICFG_ADDR             (PKUNITY_PCICFG_BASE + 0x0004)
-/*
- * PCICFG Address Reg.
- */
-#define PCICFG_DATA             (PKUNITY_PCICFG_BASE + 0x0008)
-
-/*
- * PCI Bridge configuration space
- */
-#define PCIBRI_ID		(PKUNITY_PCIBRI_BASE + 0x0000)
-#define PCIBRI_CMD		(PKUNITY_PCIBRI_BASE + 0x0004)
-#define PCIBRI_CLASS		(PKUNITY_PCIBRI_BASE + 0x0008)
-#define PCIBRI_LTR		(PKUNITY_PCIBRI_BASE + 0x000C)
-#define PCIBRI_BAR0		(PKUNITY_PCIBRI_BASE + 0x0010)
-#define PCIBRI_BAR1		(PKUNITY_PCIBRI_BASE + 0x0014)
-#define PCIBRI_BAR2		(PKUNITY_PCIBRI_BASE + 0x0018)
-#define PCIBRI_BAR3		(PKUNITY_PCIBRI_BASE + 0x001C)
-#define PCIBRI_BAR4		(PKUNITY_PCIBRI_BASE + 0x0020)
-#define PCIBRI_BAR5		(PKUNITY_PCIBRI_BASE + 0x0024)
-
-#define PCIBRI_PCICTL0		(PKUNITY_PCIBRI_BASE + 0x0100)
-#define PCIBRI_PCIBAR0		(PKUNITY_PCIBRI_BASE + 0x0104)
-#define PCIBRI_PCIAMR0		(PKUNITY_PCIBRI_BASE + 0x0108)
-#define PCIBRI_PCITAR0		(PKUNITY_PCIBRI_BASE + 0x010C)
-#define PCIBRI_PCICTL1		(PKUNITY_PCIBRI_BASE + 0x0110)
-#define PCIBRI_PCIBAR1		(PKUNITY_PCIBRI_BASE + 0x0114)
-#define PCIBRI_PCIAMR1		(PKUNITY_PCIBRI_BASE + 0x0118)
-#define PCIBRI_PCITAR1		(PKUNITY_PCIBRI_BASE + 0x011C)
-#define PCIBRI_PCICTL2		(PKUNITY_PCIBRI_BASE + 0x0120)
-#define PCIBRI_PCIBAR2		(PKUNITY_PCIBRI_BASE + 0x0124)
-#define PCIBRI_PCIAMR2		(PKUNITY_PCIBRI_BASE + 0x0128)
-#define PCIBRI_PCITAR2		(PKUNITY_PCIBRI_BASE + 0x012C)
-#define PCIBRI_PCICTL3		(PKUNITY_PCIBRI_BASE + 0x0130)
-#define PCIBRI_PCIBAR3		(PKUNITY_PCIBRI_BASE + 0x0134)
-#define PCIBRI_PCIAMR3		(PKUNITY_PCIBRI_BASE + 0x0138)
-#define PCIBRI_PCITAR3		(PKUNITY_PCIBRI_BASE + 0x013C)
-#define PCIBRI_PCICTL4		(PKUNITY_PCIBRI_BASE + 0x0140)
-#define PCIBRI_PCIBAR4		(PKUNITY_PCIBRI_BASE + 0x0144)
-#define PCIBRI_PCIAMR4		(PKUNITY_PCIBRI_BASE + 0x0148)
-#define PCIBRI_PCITAR4		(PKUNITY_PCIBRI_BASE + 0x014C)
-#define PCIBRI_PCICTL5		(PKUNITY_PCIBRI_BASE + 0x0150)
-#define PCIBRI_PCIBAR5		(PKUNITY_PCIBRI_BASE + 0x0154)
-#define PCIBRI_PCIAMR5		(PKUNITY_PCIBRI_BASE + 0x0158)
-#define PCIBRI_PCITAR5		(PKUNITY_PCIBRI_BASE + 0x015C)
-
-#define PCIBRI_AHBCTL0		(PKUNITY_PCIBRI_BASE + 0x0180)
-#define PCIBRI_AHBBAR0		(PKUNITY_PCIBRI_BASE + 0x0184)
-#define PCIBRI_AHBAMR0		(PKUNITY_PCIBRI_BASE + 0x0188)
-#define PCIBRI_AHBTAR0		(PKUNITY_PCIBRI_BASE + 0x018C)
-#define PCIBRI_AHBCTL1		(PKUNITY_PCIBRI_BASE + 0x0190)
-#define PCIBRI_AHBBAR1		(PKUNITY_PCIBRI_BASE + 0x0194)
-#define PCIBRI_AHBAMR1		(PKUNITY_PCIBRI_BASE + 0x0198)
-#define PCIBRI_AHBTAR1		(PKUNITY_PCIBRI_BASE + 0x019C)
-#define PCIBRI_AHBCTL2		(PKUNITY_PCIBRI_BASE + 0x01A0)
-#define PCIBRI_AHBBAR2		(PKUNITY_PCIBRI_BASE + 0x01A4)
-#define PCIBRI_AHBAMR2		(PKUNITY_PCIBRI_BASE + 0x01A8)
-#define PCIBRI_AHBTAR2		(PKUNITY_PCIBRI_BASE + 0x01AC)
-#define PCIBRI_AHBCTL3		(PKUNITY_PCIBRI_BASE + 0x01B0)
-#define PCIBRI_AHBBAR3		(PKUNITY_PCIBRI_BASE + 0x01B4)
-#define PCIBRI_AHBAMR3		(PKUNITY_PCIBRI_BASE + 0x01B8)
-#define PCIBRI_AHBTAR3		(PKUNITY_PCIBRI_BASE + 0x01BC)
-#define PCIBRI_AHBCTL4		(PKUNITY_PCIBRI_BASE + 0x01C0)
-#define PCIBRI_AHBBAR4		(PKUNITY_PCIBRI_BASE + 0x01C4)
-#define PCIBRI_AHBAMR4		(PKUNITY_PCIBRI_BASE + 0x01C8)
-#define PCIBRI_AHBTAR4		(PKUNITY_PCIBRI_BASE + 0x01CC)
-#define PCIBRI_AHBCTL5		(PKUNITY_PCIBRI_BASE + 0x01D0)
-#define PCIBRI_AHBBAR5		(PKUNITY_PCIBRI_BASE + 0x01D4)
-#define PCIBRI_AHBAMR5		(PKUNITY_PCIBRI_BASE + 0x01D8)
-#define PCIBRI_AHBTAR5		(PKUNITY_PCIBRI_BASE + 0x01DC)
-
-#define PCIBRI_CTLx_AT          FIELD(1, 1, 2)
-#define PCIBRI_CTLx_PREF        FIELD(1, 1, 1)
-#define PCIBRI_CTLx_MRL         FIELD(1, 1, 0)
-
-#define PCIBRI_BARx_ADDR        FIELD(0xFFFFFFFC, 30, 2)
-#define PCIBRI_BARx_IO          FIELD(1, 1, 0)
-#define PCIBRI_BARx_MEM         FIELD(0, 1, 0)
-
-#define PCIBRI_CMD_IO           FIELD(1, 1, 0)
-#define PCIBRI_CMD_MEM          FIELD(1, 1, 1)
diff --git a/arch/unicore32/include/mach/regs-pm.h b/arch/unicore32/include/mach/regs-pm.h
deleted file mode 100644
index 777b1ace39b9..000000000000
--- a/arch/unicore32/include/mach/regs-pm.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUNITY Power Manager (PM) Registers
- */
-/*
- * PM Control Reg PM_PMCR
- */
-#define PM_PMCR                 (PKUNITY_PM_BASE + 0x0000)
-/*
- * PM General Conf. Reg PM_PGCR
- */
-#define PM_PGCR                 (PKUNITY_PM_BASE + 0x0004)
-/*
- * PM PLL Conf. Reg PM_PPCR
- */
-#define PM_PPCR                 (PKUNITY_PM_BASE + 0x0008)
-/*
- * PM Wakeup Enable Reg PM_PWER
- */
-#define PM_PWER                 (PKUNITY_PM_BASE + 0x000C)
-/*
- * PM GPIO Sleep Status Reg PM_PGSR
- */
-#define PM_PGSR                 (PKUNITY_PM_BASE + 0x0010)
-/*
- * PM Clock Gate Reg PM_PCGR
- */
-#define PM_PCGR                 (PKUNITY_PM_BASE + 0x0014)
-/*
- * PM SYS PLL Conf. Reg PM_PLLSYSCFG
- */
-#define PM_PLLSYSCFG            (PKUNITY_PM_BASE + 0x0018)
-/*
- * PM DDR PLL Conf. Reg PM_PLLDDRCFG
- */
-#define PM_PLLDDRCFG            (PKUNITY_PM_BASE + 0x001C)
-/*
- * PM VGA PLL Conf. Reg PM_PLLVGACFG
- */
-#define PM_PLLVGACFG            (PKUNITY_PM_BASE + 0x0020)
-/*
- * PM Div Conf. Reg PM_DIVCFG
- */
-#define PM_DIVCFG               (PKUNITY_PM_BASE + 0x0024)
-/*
- * PM SYS PLL Status Reg PM_PLLSYSSTATUS
- */
-#define PM_PLLSYSSTATUS         (PKUNITY_PM_BASE + 0x0028)
-/*
- * PM DDR PLL Status Reg PM_PLLDDRSTATUS
- */
-#define PM_PLLDDRSTATUS         (PKUNITY_PM_BASE + 0x002C)
-/*
- * PM VGA PLL Status Reg PM_PLLVGASTATUS
- */
-#define PM_PLLVGASTATUS         (PKUNITY_PM_BASE + 0x0030)
-/*
- * PM Div Status Reg PM_DIVSTATUS
- */
-#define PM_DIVSTATUS            (PKUNITY_PM_BASE + 0x0034)
-/*
- * PM Software Reset Reg PM_SWRESET
- */
-#define PM_SWRESET              (PKUNITY_PM_BASE + 0x0038)
-/*
- * PM DDR2 PAD Start Reg PM_DDR2START
- */
-#define PM_DDR2START            (PKUNITY_PM_BASE + 0x003C)
-/*
- * PM DDR2 PAD Status Reg PM_DDR2CAL0
- */
-#define PM_DDR2CAL0             (PKUNITY_PM_BASE + 0x0040)
-/*
- * PM PLL DFC Done Reg PM_PLLDFCDONE
- */
-#define PM_PLLDFCDONE           (PKUNITY_PM_BASE + 0x0044)
-
-#define PM_PMCR_SFB             FIELD(1, 1, 0)
-#define PM_PMCR_IFB             FIELD(1, 1, 1)
-#define PM_PMCR_CFBSYS          FIELD(1, 1, 2)
-#define PM_PMCR_CFBDDR          FIELD(1, 1, 3)
-#define PM_PMCR_CFBVGA          FIELD(1, 1, 4)
-#define PM_PMCR_CFBDIVBCLK      FIELD(1, 1, 5)
-
-/*
- * GPIO 8~27 wake-up enable PM_PWER_GPIOHIGH
- */
-#define PM_PWER_GPIOHIGH        FIELD(1, 1, 8)
-/*
- * RTC alarm wake-up enable PM_PWER_RTC
- */
-#define PM_PWER_RTC             FIELD(1, 1, 31)
-
-#define PM_PCGR_BCLK64DDR	FIELD(1, 1, 0)
-#define PM_PCGR_BCLK64VGA	FIELD(1, 1, 1)
-#define PM_PCGR_BCLKDDR		FIELD(1, 1, 2)
-#define PM_PCGR_BCLKPCI		FIELD(1, 1, 4)
-#define PM_PCGR_BCLKDMAC	FIELD(1, 1, 5)
-#define PM_PCGR_BCLKUMAL	FIELD(1, 1, 6)
-#define PM_PCGR_BCLKUSB		FIELD(1, 1, 7)
-#define PM_PCGR_BCLKMME		FIELD(1, 1, 10)
-#define PM_PCGR_BCLKNAND	FIELD(1, 1, 11)
-#define PM_PCGR_BCLKH264E	FIELD(1, 1, 12)
-#define PM_PCGR_BCLKVGA		FIELD(1, 1, 13)
-#define PM_PCGR_BCLKH264D	FIELD(1, 1, 14)
-#define PM_PCGR_VECLK		FIELD(1, 1, 15)
-#define PM_PCGR_HECLK		FIELD(1, 1, 16)
-#define PM_PCGR_HDCLK		FIELD(1, 1, 17)
-#define PM_PCGR_NANDCLK		FIELD(1, 1, 18)
-#define PM_PCGR_GECLK		FIELD(1, 1, 19)
-#define PM_PCGR_VGACLK          FIELD(1, 1, 20)
-#define PM_PCGR_PCICLK		FIELD(1, 1, 21)
-#define PM_PCGR_SATACLK		FIELD(1, 1, 25)
-
-/*
- * [23:20]PM_DIVCFG_VGACLK(v)
- */
-#define PM_DIVCFG_VGACLK_MASK   FMASK(4, 20)
-#define PM_DIVCFG_VGACLK(v)	FIELD((v), 4, 20)
-
-#define PM_SWRESET_USB          FIELD(1, 1, 6)
-#define PM_SWRESET_VGADIV       FIELD(1, 1, 26)
-#define PM_SWRESET_GEDIV        FIELD(1, 1, 27)
-
-#define PM_PLLDFCDONE_SYSDFC    FIELD(1, 1, 0)
-#define PM_PLLDFCDONE_DDRDFC    FIELD(1, 1, 1)
-#define PM_PLLDFCDONE_VGADFC    FIELD(1, 1, 2)
diff --git a/arch/unicore32/include/mach/regs-ps2.h b/arch/unicore32/include/mach/regs-ps2.h
deleted file mode 100644
index d539d7482462..000000000000
--- a/arch/unicore32/include/mach/regs-ps2.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity PS2 Controller Registers
- */
-/*
- * the same as I8042_DATA_REG PS2_DATA
- */
-#define PS2_DATA	(PKUNITY_PS2_BASE + 0x0060)
-/*
- * the same as I8042_COMMAND_REG PS2_COMMAND
- */
-#define PS2_COMMAND	(PKUNITY_PS2_BASE + 0x0064)
-/*
- * the same as I8042_STATUS_REG PS2_STATUS
- */
-#define PS2_STATUS	(PKUNITY_PS2_BASE + 0x0064)
-/*
- * counter reg PS2_CNT
- */
-#define PS2_CNT		(PKUNITY_PS2_BASE + 0x0068)
-
diff --git a/arch/unicore32/include/mach/regs-resetc.h b/arch/unicore32/include/mach/regs-resetc.h
deleted file mode 100644
index 5f2b9d77a9ec..000000000000
--- a/arch/unicore32/include/mach/regs-resetc.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Reset Controller (RC) Registers
- */
-/*
- * Software Reset Register
- */
-#define RESETC_SWRR	(PKUNITY_RESETC_BASE + 0x0000)
-/*
- * Reset Status Register
- */
-#define RESETC_RSSR	(PKUNITY_RESETC_BASE + 0x0004)
-
-/*
- * Software Reset Bit
- */
-#define RESETC_SWRR_SRB		FIELD(1, 1, 0)
-
-/*
- * Hardware Reset
- */
-#define RESETC_RSSR_HWR		FIELD(1, 1, 0)
-/*
- * Software Reset
- */
-#define RESETC_RSSR_SWR		FIELD(1, 1, 1)
-/*
- * Watchdog Reset
- */
-#define RESETC_RSSR_WDR		FIELD(1, 1, 2)
-/*
- * Sleep Mode Reset
- */
-#define RESETC_RSSR_SMR		FIELD(1, 1, 3)
-
diff --git a/arch/unicore32/include/mach/regs-rtc.h b/arch/unicore32/include/mach/regs-rtc.h
deleted file mode 100644
index f2f7f47eb65e..000000000000
--- a/arch/unicore32/include/mach/regs-rtc.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Real-Time Clock (RTC) control registers
- */
-/*
- * RTC Alarm Reg RTC_RTAR
- */
-#define RTC_RTAR	(PKUNITY_RTC_BASE + 0x0000)
-/*
- * RTC Count Reg RTC_RCNR
- */
-#define RTC_RCNR	(PKUNITY_RTC_BASE + 0x0004)
-/*
- * RTC Trim Reg RTC_RTTR
- */
-#define RTC_RTTR	(PKUNITY_RTC_BASE + 0x0008)
-/*
- * RTC Status Reg RTC_RTSR
- */
-#define RTC_RTSR	(PKUNITY_RTC_BASE + 0x0010)
-
-/*
- * ALarm detected RTC_RTSR_AL
- */
-#define RTC_RTSR_AL		FIELD(1, 1, 0)
-/*
- * 1 Hz clock detected RTC_RTSR_HZ
- */
-#define RTC_RTSR_HZ		FIELD(1, 1, 1)
-/*
- * ALarm interrupt Enable RTC_RTSR_ALE
- */
-#define RTC_RTSR_ALE		FIELD(1, 1, 2)
-/*
- * 1 Hz clock interrupt Enable RTC_RTSR_HZE
- */
-#define RTC_RTSR_HZE		FIELD(1, 1, 3)
-
diff --git a/arch/unicore32/include/mach/regs-sdc.h b/arch/unicore32/include/mach/regs-sdc.h
deleted file mode 100644
index 658bfaf4cb3c..000000000000
--- a/arch/unicore32/include/mach/regs-sdc.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Multi-Media Card and Security Digital Card (MMC/SD) Registers
- */
-/*
- * Clock Control Reg SDC_CCR
- */
-#define SDC_CCR		(PKUNITY_SDC_BASE + 0x0000)
-/*
- * Software Reset Reg SDC_SRR
- */
-#define SDC_SRR		(PKUNITY_SDC_BASE + 0x0004)
-/*
- * Argument Reg SDC_ARGUMENT
- */
-#define SDC_ARGUMENT	(PKUNITY_SDC_BASE + 0x0008)
-/*
- * Command Reg SDC_COMMAND
- */
-#define SDC_COMMAND	(PKUNITY_SDC_BASE + 0x000C)
-/*
- * Block Size Reg SDC_BLOCKSIZE
- */
-#define SDC_BLOCKSIZE	(PKUNITY_SDC_BASE + 0x0010)
-/*
- * Block Cound Reg SDC_BLOCKCOUNT
- */
-#define SDC_BLOCKCOUNT	(PKUNITY_SDC_BASE + 0x0014)
-/*
- * Transfer Mode Reg SDC_TMR
- */
-#define SDC_TMR		(PKUNITY_SDC_BASE + 0x0018)
-/*
- * Response Reg. 0 SDC_RES0
- */
-#define SDC_RES0	(PKUNITY_SDC_BASE + 0x001C)
-/*
- * Response Reg. 1 SDC_RES1
- */
-#define SDC_RES1	(PKUNITY_SDC_BASE + 0x0020)
-/*
- * Response Reg. 2 SDC_RES2
- */
-#define SDC_RES2	(PKUNITY_SDC_BASE + 0x0024)
-/*
- * Response Reg. 3 SDC_RES3
- */
-#define SDC_RES3	(PKUNITY_SDC_BASE + 0x0028)
-/*
- * Read Timeout Control Reg SDC_RTCR
- */
-#define SDC_RTCR	(PKUNITY_SDC_BASE + 0x002C)
-/*
- * Interrupt Status Reg SDC_ISR
- */
-#define SDC_ISR		(PKUNITY_SDC_BASE + 0x0030)
-/*
- * Interrupt Status Mask Reg SDC_ISMR
- */
-#define SDC_ISMR	(PKUNITY_SDC_BASE + 0x0034)
-/*
- * RX FIFO SDC_RXFIFO
- */
-#define SDC_RXFIFO	(PKUNITY_SDC_BASE + 0x0038)
-/*
- * TX FIFO SDC_TXFIFO
- */
-#define SDC_TXFIFO	(PKUNITY_SDC_BASE + 0x003C)
-
-/*
- * SD Clock Enable SDC_CCR_CLKEN
- */
-#define SDC_CCR_CLKEN			FIELD(1, 1, 2)
-/*
- * [15:8] SDC_CCR_PDIV(v)
- */
-#define SDC_CCR_PDIV(v)			FIELD((v), 8, 8)
-
-/*
- * Software reset enable SDC_SRR_ENABLE
- */
-#define SDC_SRR_ENABLE			FIELD(0, 1, 0)
-/*
- * Software reset disable SDC_SRR_DISABLE
- */
-#define SDC_SRR_DISABLE			FIELD(1, 1, 0)
-
-/*
- * Response type SDC_COMMAND_RESTYPE_MASK
- */
-#define SDC_COMMAND_RESTYPE_MASK	FMASK(2, 0)
-/*
- * No response SDC_COMMAND_RESTYPE_NONE
- */
-#define SDC_COMMAND_RESTYPE_NONE	FIELD(0, 2, 0)
-/*
- * 136-bit long response SDC_COMMAND_RESTYPE_LONG
- */
-#define SDC_COMMAND_RESTYPE_LONG	FIELD(1, 2, 0)
-/*
- * 48-bit short response SDC_COMMAND_RESTYPE_SHORT
- */
-#define SDC_COMMAND_RESTYPE_SHORT	FIELD(2, 2, 0)
-/*
- * 48-bit short and test if busy response SDC_COMMAND_RESTYPE_SHORTBUSY
- */
-#define SDC_COMMAND_RESTYPE_SHORTBUSY	FIELD(3, 2, 0)
-/*
- * data ready SDC_COMMAND_DATAREADY
- */
-#define SDC_COMMAND_DATAREADY		FIELD(1, 1, 2)
-#define SDC_COMMAND_CMDEN		FIELD(1, 1, 3)
-/*
- * [10:5] SDC_COMMAND_CMDINDEX(v)
- */
-#define SDC_COMMAND_CMDINDEX(v)		FIELD((v), 6, 5)
-
-/*
- * [10:0] SDC_BLOCKSIZE_BSMASK(v)
- */
-#define SDC_BLOCKSIZE_BSMASK(v)		FIELD((v), 11, 0)
-/*
- * [11:0] SDC_BLOCKCOUNT_BCMASK(v)
- */
-#define SDC_BLOCKCOUNT_BCMASK(v)	FIELD((v), 12, 0)
-
-/*
- * Data Width 1bit SDC_TMR_WTH_1BIT
- */
-#define SDC_TMR_WTH_1BIT		FIELD(0, 1, 0)
-/*
- * Data Width 4bit SDC_TMR_WTH_4BIT
- */
-#define SDC_TMR_WTH_4BIT		FIELD(1, 1, 0)
-/*
- * Read SDC_TMR_DIR_READ
- */
-#define SDC_TMR_DIR_READ		FIELD(0, 1, 1)
-/*
- * Write SDC_TMR_DIR_WRITE
- */
-#define SDC_TMR_DIR_WRITE		FIELD(1, 1, 1)
-
-#define SDC_IR_MASK			FMASK(13, 0)
-#define SDC_IR_RESTIMEOUT		FIELD(1, 1, 0)
-#define SDC_IR_WRITECRC			FIELD(1, 1, 1)
-#define SDC_IR_READCRC			FIELD(1, 1, 2)
-#define SDC_IR_TXFIFOREAD		FIELD(1, 1, 3)
-#define SDC_IR_RXFIFOWRITE		FIELD(1, 1, 4)
-#define SDC_IR_READTIMEOUT		FIELD(1, 1, 5)
-#define SDC_IR_DATACOMPLETE		FIELD(1, 1, 6)
-#define SDC_IR_CMDCOMPLETE		FIELD(1, 1, 7)
-#define SDC_IR_RXFIFOFULL		FIELD(1, 1, 8)
-#define SDC_IR_RXFIFOEMPTY		FIELD(1, 1, 9)
-#define SDC_IR_TXFIFOFULL		FIELD(1, 1, 10)
-#define SDC_IR_TXFIFOEMPTY		FIELD(1, 1, 11)
-#define SDC_IR_ENDCMDWITHRES		FIELD(1, 1, 12)
diff --git a/arch/unicore32/include/mach/regs-spi.h b/arch/unicore32/include/mach/regs-spi.h
deleted file mode 100644
index 3460647a9c2a..000000000000
--- a/arch/unicore32/include/mach/regs-spi.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Serial Peripheral Interface (SPI) Registers
- */
-/*
- * Control reg. 0 SPI_CR0
- */
-#define SPI_CR0		(PKUNITY_SPI_BASE + 0x0000)
-/*
- * Control reg. 1 SPI_CR1
- */
-#define SPI_CR1		(PKUNITY_SPI_BASE + 0x0004)
-/*
- * Enable reg SPI_SSIENR
- */
-#define SPI_SSIENR	(PKUNITY_SPI_BASE + 0x0008)
-/*
- * Status reg SPI_SR
- */
-#define SPI_SR		(PKUNITY_SPI_BASE + 0x0028)
-/*
- * Interrupt Mask reg SPI_IMR
- */
-#define SPI_IMR		(PKUNITY_SPI_BASE + 0x002C)
-/*
- * Interrupt Status reg SPI_ISR
- */
-#define SPI_ISR		(PKUNITY_SPI_BASE + 0x0030)
-
-/*
- * Enable SPI Controller SPI_SSIENR_EN
- */
-#define SPI_SSIENR_EN		FIELD(1, 1, 0)
-
-/*
- * SPI Busy SPI_SR_BUSY
- */
-#define SPI_SR_BUSY		FIELD(1, 1, 0)
-/*
- * Transmit FIFO Not Full SPI_SR_TFNF
- */
-#define SPI_SR_TFNF		FIELD(1, 1, 1)
-/*
- * Transmit FIFO Empty SPI_SR_TFE
- */
-#define SPI_SR_TFE		FIELD(1, 1, 2)
-/*
- * Receive FIFO Not Empty SPI_SR_RFNE
- */
-#define SPI_SR_RFNE		FIELD(1, 1, 3)
-/*
- * Receive FIFO Full SPI_SR_RFF
- */
-#define SPI_SR_RFF		FIELD(1, 1, 4)
-
-/*
- * Trans. FIFO Empty Interrupt Status SPI_ISR_TXEIS
- */
-#define SPI_ISR_TXEIS		FIELD(1, 1, 0)
-/*
- * Trans. FIFO Overflow Interrupt Status SPI_ISR_TXOIS
- */
-#define SPI_ISR_TXOIS		FIELD(1, 1, 1)
-/*
- * Receiv. FIFO Underflow Interrupt Status SPI_ISR_RXUIS
- */
-#define SPI_ISR_RXUIS		FIELD(1, 1, 2)
-/*
- * Receiv. FIFO Overflow Interrupt Status SPI_ISR_RXOIS
- */
-#define SPI_ISR_RXOIS		FIELD(1, 1, 3)
-/*
- * Receiv. FIFO Full Interrupt Status SPI_ISR_RXFIS
- */
-#define SPI_ISR_RXFIS		FIELD(1, 1, 4)
-#define SPI_ISR_MSTIS		FIELD(1, 1, 5)
-
-/*
- * Trans. FIFO Empty Interrupt Mask SPI_IMR_TXEIM
- */
-#define SPI_IMR_TXEIM		FIELD(1, 1, 0)
-/*
- * Trans. FIFO Overflow Interrupt Mask SPI_IMR_TXOIM
- */
-#define SPI_IMR_TXOIM		FIELD(1, 1, 1)
-/*
- * Receiv. FIFO Underflow Interrupt Mask SPI_IMR_RXUIM
- */
-#define SPI_IMR_RXUIM		FIELD(1, 1, 2)
-/*
- * Receiv. FIFO Overflow Interrupt Mask SPI_IMR_RXOIM
- */
-#define SPI_IMR_RXOIM		FIELD(1, 1, 3)
-/*
- * Receiv. FIFO Full Interrupt Mask SPI_IMR_RXFIM
- */
-#define SPI_IMR_RXFIM		FIELD(1, 1, 4)
-#define SPI_IMR_MSTIM		FIELD(1, 1, 5)
-
diff --git a/arch/unicore32/include/mach/regs-uart.h b/arch/unicore32/include/mach/regs-uart.h
deleted file mode 100644
index 9fa6b1938b77..000000000000
--- a/arch/unicore32/include/mach/regs-uart.h
+++ /dev/null
@@ -1,3 +0,0 @@
-/*
- * PKUnity Universal Asynchronous Receiver/Transmitter (UART) Registers
- */
diff --git a/arch/unicore32/include/mach/regs-umal.h b/arch/unicore32/include/mach/regs-umal.h
deleted file mode 100644
index 7023089c61c6..000000000000
--- a/arch/unicore32/include/mach/regs-umal.h
+++ /dev/null
@@ -1,230 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity Ultra Media Access Layer (UMAL) Ethernet MAC Registers
- */
-
-/* MAC module of UMAL */
-/* UMAL's MAC module includes G/MII interface, several additional PHY
- * interfaces, and MAC control sub-layer, which provides support for control
- * frames (e.g. PAUSE frames).
- */
-/*
- * TX/RX reset and control UMAL_CFG1
- */
-#define UMAL_CFG1		(PKUNITY_UMAL_BASE + 0x0000)
-/*
- * MAC interface mode control UMAL_CFG2
- */
-#define UMAL_CFG2		(PKUNITY_UMAL_BASE + 0x0004)
-/*
- * Inter Packet/Frame Gap UMAL_IPGIFG
- */
-#define UMAL_IPGIFG		(PKUNITY_UMAL_BASE + 0x0008)
-/*
- * Collision retry or backoff UMAL_HALFDUPLEX
- */
-#define UMAL_HALFDUPLEX		(PKUNITY_UMAL_BASE + 0x000c)
-/*
- * Maximum Frame Length UMAL_MAXFRAME
- */
-#define UMAL_MAXFRAME		(PKUNITY_UMAL_BASE + 0x0010)
-/*
- * Test Regsiter UMAL_TESTREG
- */
-#define UMAL_TESTREG		(PKUNITY_UMAL_BASE + 0x001c)
-/*
- * MII Management Configure UMAL_MIICFG
- */
-#define UMAL_MIICFG		(PKUNITY_UMAL_BASE + 0x0020)
-/*
- * MII Management Command UMAL_MIICMD
- */
-#define UMAL_MIICMD		(PKUNITY_UMAL_BASE + 0x0024)
-/*
- * MII Management Address UMAL_MIIADDR
- */
-#define UMAL_MIIADDR		(PKUNITY_UMAL_BASE + 0x0028)
-/*
- * MII Management Control UMAL_MIICTRL
- */
-#define UMAL_MIICTRL		(PKUNITY_UMAL_BASE + 0x002c)
-/*
- * MII Management Status UMAL_MIISTATUS
- */
-#define UMAL_MIISTATUS		(PKUNITY_UMAL_BASE + 0x0030)
-/*
- * MII Management Indicator UMAL_MIIIDCT
- */
-#define UMAL_MIIIDCT		(PKUNITY_UMAL_BASE + 0x0034)
-/*
- * Interface Control UMAL_IFCTRL
- */
-#define UMAL_IFCTRL		(PKUNITY_UMAL_BASE + 0x0038)
-/*
- * Interface Status UMAL_IFSTATUS
- */
-#define UMAL_IFSTATUS		(PKUNITY_UMAL_BASE + 0x003c)
-/*
- * MAC address (high 4 bytes) UMAL_STADDR1
- */
-#define UMAL_STADDR1		(PKUNITY_UMAL_BASE + 0x0040)
-/*
- * MAC address (low 2 bytes) UMAL_STADDR2
- */
-#define UMAL_STADDR2		(PKUNITY_UMAL_BASE + 0x0044)
-
-/* FIFO MODULE OF UMAL */
-/* UMAL's FIFO module provides data queuing for increased system level
- * throughput
- */
-#define UMAL_FIFOCFG0		(PKUNITY_UMAL_BASE + 0x0048)
-#define UMAL_FIFOCFG1		(PKUNITY_UMAL_BASE + 0x004c)
-#define UMAL_FIFOCFG2		(PKUNITY_UMAL_BASE + 0x0050)
-#define UMAL_FIFOCFG3		(PKUNITY_UMAL_BASE + 0x0054)
-#define UMAL_FIFOCFG4		(PKUNITY_UMAL_BASE + 0x0058)
-#define UMAL_FIFOCFG5		(PKUNITY_UMAL_BASE + 0x005c)
-#define UMAL_FIFORAM0		(PKUNITY_UMAL_BASE + 0x0060)
-#define UMAL_FIFORAM1		(PKUNITY_UMAL_BASE + 0x0064)
-#define UMAL_FIFORAM2		(PKUNITY_UMAL_BASE + 0x0068)
-#define UMAL_FIFORAM3		(PKUNITY_UMAL_BASE + 0x006c)
-#define UMAL_FIFORAM4		(PKUNITY_UMAL_BASE + 0x0070)
-#define UMAL_FIFORAM5		(PKUNITY_UMAL_BASE + 0x0074)
-#define UMAL_FIFORAM6		(PKUNITY_UMAL_BASE + 0x0078)
-#define UMAL_FIFORAM7		(PKUNITY_UMAL_BASE + 0x007c)
-
-/* MAHBE MODULE OF UMAL */
-/* UMAL's MAHBE module interfaces to the host system through 32-bit AHB Master
- * and Slave ports.Registers within the M-AHBE provide Control and Status
- * information concerning these transfers.
- */
-/*
- * Transmit Control UMAL_DMATxCtrl
- */
-#define UMAL_DMATxCtrl		(PKUNITY_UMAL_BASE + 0x0180)
-/*
- * Pointer to TX Descripter UMAL_DMATxDescriptor
- */
-#define UMAL_DMATxDescriptor	(PKUNITY_UMAL_BASE + 0x0184)
-/*
- * Status of Tx Packet Transfers UMAL_DMATxStatus
- */
-#define UMAL_DMATxStatus	(PKUNITY_UMAL_BASE + 0x0188)
-/*
- * Receive Control UMAL_DMARxCtrl
- */
-#define UMAL_DMARxCtrl		(PKUNITY_UMAL_BASE + 0x018c)
-/*
- * Pointer to Rx Descriptor UMAL_DMARxDescriptor
- */
-#define UMAL_DMARxDescriptor	(PKUNITY_UMAL_BASE + 0x0190)
-/*
- * Status of Rx Packet Transfers UMAL_DMARxStatus
- */
-#define UMAL_DMARxStatus	(PKUNITY_UMAL_BASE + 0x0194)
-/*
- * Interrupt Mask UMAL_DMAIntrMask
- */
-#define UMAL_DMAIntrMask	(PKUNITY_UMAL_BASE + 0x0198)
-/*
- * Interrupts, read only UMAL_DMAInterrupt
- */
-#define UMAL_DMAInterrupt	(PKUNITY_UMAL_BASE + 0x019c)
-
-/*
- * Commands for UMAL_CFG1 register
- */
-#define UMAL_CFG1_TXENABLE	FIELD(1, 1, 0)
-#define UMAL_CFG1_RXENABLE	FIELD(1, 1, 2)
-#define UMAL_CFG1_TXFLOWCTL	FIELD(1, 1, 4)
-#define UMAL_CFG1_RXFLOWCTL	FIELD(1, 1, 5)
-#define UMAL_CFG1_CONFLPBK	FIELD(1, 1, 8)
-#define UMAL_CFG1_RESET		FIELD(1, 1, 31)
-#define UMAL_CFG1_CONFFLCTL	(MAC_TX_FLOW_CTL | MAC_RX_FLOW_CTL)
-
-/*
- * Commands for UMAL_CFG2 register
- */
-#define UMAL_CFG2_FULLDUPLEX	FIELD(1, 1, 0)
-#define UMAL_CFG2_CRCENABLE	FIELD(1, 1, 1)
-#define UMAL_CFG2_PADCRC	FIELD(1, 1, 2)
-#define UMAL_CFG2_LENGTHCHECK	FIELD(1, 1, 4)
-#define UMAL_CFG2_MODEMASK	FMASK(2, 8)
-#define UMAL_CFG2_NIBBLEMODE	FIELD(1, 2, 8)
-#define UMAL_CFG2_BYTEMODE	FIELD(2, 2, 8)
-#define UMAL_CFG2_PREAMBLENMASK	FMASK(4, 12)
-#define UMAL_CFG2_DEFPREAMBLEN	FIELD(7, 4, 12)
-#define UMAL_CFG2_FD100		(UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_NIBBLEMODE \
-				| UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \
-				| UMAL_CFG2_CRCENABLE | UMAL_CFG2_FULLDUPLEX)
-#define UMAL_CFG2_FD1000	(UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_BYTEMODE \
-				| UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \
-				| UMAL_CFG2_CRCENABLE | UMAL_CFG2_FULLDUPLEX)
-#define UMAL_CFG2_HD100		(UMAL_CFG2_DEFPREAMBLEN | UMAL_CFG2_NIBBLEMODE \
-				| UMAL_CFG2_LENGTHCHECK | UMAL_CFG2_PADCRC \
-				| UMAL_CFG2_CRCENABLE)
-
-/*
- * Command for UMAL_IFCTRL register
- */
-#define UMAL_IFCTRL_RESET	FIELD(1, 1, 31)
-
-/*
- * Command for UMAL_MIICFG register
- */
-#define UMAL_MIICFG_RESET	FIELD(1, 1, 31)
-
-/*
- * Command for UMAL_MIICMD register
- */
-#define UMAL_MIICMD_READ	FIELD(1, 1, 0)
-
-/*
- * Command for UMAL_MIIIDCT register
- */
-#define UMAL_MIIIDCT_BUSY	FIELD(1, 1, 0)
-#define UMAL_MIIIDCT_NOTVALID	FIELD(1, 1, 2)
-
-/*
- * Commands for DMATxCtrl regesters
- */
-#define UMAL_DMA_Enable		FIELD(1, 1, 0)
-
-/*
- * Commands for DMARxCtrl regesters
- */
-#define UMAL_DMAIntrMask_ENABLEHALFWORD	FIELD(1, 1, 16)
-
-/*
- * Command for DMARxStatus
- */
-#define CLR_RX_BUS_ERR		FIELD(1, 1, 3)
-#define CLR_RX_OVERFLOW		FIELD(1, 1, 2)
-#define CLR_RX_PKT		FIELD(1, 1, 0)
-
-/*
- * Command for DMATxStatus
- */
-#define CLR_TX_BUS_ERR		FIELD(1, 1, 3)
-#define CLR_TX_UNDERRUN		FIELD(1, 1, 1)
-#define CLR_TX_PKT		FIELD(1, 1, 0)
-
-/*
- * Commands for DMAIntrMask and DMAInterrupt register
- */
-#define INT_RX_MASK		FIELD(0xd, 4, 4)
-#define INT_TX_MASK		FIELD(0xb, 4, 0)
-
-#define INT_RX_BUS_ERR		FIELD(1, 1, 7)
-#define INT_RX_OVERFLOW		FIELD(1, 1, 6)
-#define INT_RX_PKT		FIELD(1, 1, 4)
-#define INT_TX_BUS_ERR		FIELD(1, 1, 3)
-#define INT_TX_UNDERRUN		FIELD(1, 1, 1)
-#define INT_TX_PKT		FIELD(1, 1, 0)
-
-/*
- * MARCOS of UMAL's descriptors
- */
-#define UMAL_DESC_PACKETSIZE_EMPTY	FIELD(1, 1, 31)
-#define UMAL_DESC_PACKETSIZE_NONEMPTY	FIELD(0, 1, 31)
-#define UMAL_DESC_PACKETSIZE_SIZEMASK	FMASK(12, 0)
-
diff --git a/arch/unicore32/include/mach/regs-unigfx.h b/arch/unicore32/include/mach/regs-unigfx.h
deleted file mode 100644
index 553d1157c6b2..000000000000
--- a/arch/unicore32/include/mach/regs-unigfx.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * PKUnity UNIGFX Registers
- */
-
-#define UDE_BASE      (PKUNITY_UNIGFX_BASE + 0x1400)
-#define UGE_BASE      (PKUNITY_UNIGFX_BASE + 0x0000)
-
-/*
- * command reg for UNIGFX DE
- */
-/*
- * control reg UDE_CFG
- */
-#define UDE_CFG       (UDE_BASE + 0x0000)
-/*
- * framebuffer start address reg UDE_FSA
- */
-#define UDE_FSA       (UDE_BASE + 0x0004)
-/*
- * line size reg UDE_LS
- */
-#define UDE_LS        (UDE_BASE + 0x0008)
-/*
- * pitch size reg UDE_PS
- */
-#define UDE_PS        (UDE_BASE + 0x000C)
-/*
- * horizontal active time reg UDE_HAT
- */
-#define UDE_HAT       (UDE_BASE + 0x0010)
-/*
- * horizontal blank time reg UDE_HBT
- */
-#define UDE_HBT       (UDE_BASE + 0x0014)
-/*
- * horizontal sync time reg UDE_HST
- */
-#define UDE_HST       (UDE_BASE + 0x0018)
-/*
- * vertival active time reg UDE_VAT
- */
-#define UDE_VAT       (UDE_BASE + 0x001C)
-/*
- * vertival blank time reg UDE_VBT
- */
-#define UDE_VBT       (UDE_BASE + 0x0020)
-/*
- * vertival sync time reg UDE_VST
- */
-#define UDE_VST       (UDE_BASE + 0x0024)
-/*
- * cursor position UDE_CXY
- */
-#define UDE_CXY       (UDE_BASE + 0x0028)
-/*
- * cursor front color UDE_CC0
- */
-#define UDE_CC0       (UDE_BASE + 0x002C)
-/*
- * cursor background color UDE_CC1
- */
-#define UDE_CC1       (UDE_BASE + 0x0030)
-/*
- * video position UDE_VXY
- */
-#define UDE_VXY       (UDE_BASE + 0x0034)
-/*
- * video start address reg UDE_VSA
- */
-#define UDE_VSA       (UDE_BASE + 0x0040)
-/*
- * video size reg UDE_VS
- */
-#define UDE_VS        (UDE_BASE + 0x004C)
-
-/*
- * command reg for UNIGFX GE
- */
-/*
- * src xy reg UGE_SRCXY
- */
-#define UGE_SRCXY     (UGE_BASE + 0x0000)
-/*
- * dst xy reg UGE_DSTXY
- */
-#define UGE_DSTXY     (UGE_BASE + 0x0004)
-/*
- * pitch reg UGE_PITCH
- */
-#define UGE_PITCH     (UGE_BASE + 0x0008)
-/*
- * src start reg UGE_SRCSTART
- */
-#define UGE_SRCSTART  (UGE_BASE + 0x000C)
-/*
- * dst start reg UGE_DSTSTART
- */
-#define UGE_DSTSTART  (UGE_BASE + 0x0010)
-/*
- * width height reg UGE_WIDHEIGHT
- */
-#define UGE_WIDHEIGHT (UGE_BASE + 0x0014)
-/*
- * rop alpah reg UGE_ROPALPHA
- */
-#define UGE_ROPALPHA  (UGE_BASE + 0x0018)
-/*
- * front color UGE_FCOLOR
- */
-#define UGE_FCOLOR    (UGE_BASE + 0x001C)
-/*
- * background color UGE_BCOLOR
- */
-#define UGE_BCOLOR    (UGE_BASE + 0x0020)
-/*
- * src color key for high value UGE_SCH
- */
-#define UGE_SCH       (UGE_BASE + 0x0024)
-/*
- * dst color key for high value UGE_DCH
- */
-#define UGE_DCH       (UGE_BASE + 0x0028)
-/*
- * src color key for low value UGE_SCL
- */
-#define UGE_SCL       (UGE_BASE + 0x002C)
-/*
- * dst color key for low value UGE_DCL
- */
-#define UGE_DCL       (UGE_BASE + 0x0030)
-/*
- * clip 0 reg UGE_CLIP0
- */
-#define UGE_CLIP0     (UGE_BASE + 0x0034)
-/*
- * clip 1 reg UGE_CLIP1
- */
-#define UGE_CLIP1     (UGE_BASE + 0x0038)
-/*
- * command reg UGE_COMMAND
- */
-#define UGE_COMMAND   (UGE_BASE + 0x003C)
-/*
- * pattern 0 UGE_P0
- */
-#define UGE_P0        (UGE_BASE + 0x0040)
-#define UGE_P1        (UGE_BASE + 0x0044)
-#define UGE_P2        (UGE_BASE + 0x0048)
-#define UGE_P3        (UGE_BASE + 0x004C)
-#define UGE_P4        (UGE_BASE + 0x0050)
-#define UGE_P5        (UGE_BASE + 0x0054)
-#define UGE_P6        (UGE_BASE + 0x0058)
-#define UGE_P7        (UGE_BASE + 0x005C)
-#define UGE_P8        (UGE_BASE + 0x0060)
-#define UGE_P9        (UGE_BASE + 0x0064)
-#define UGE_P10       (UGE_BASE + 0x0068)
-#define UGE_P11       (UGE_BASE + 0x006C)
-#define UGE_P12       (UGE_BASE + 0x0070)
-#define UGE_P13       (UGE_BASE + 0x0074)
-#define UGE_P14       (UGE_BASE + 0x0078)
-#define UGE_P15       (UGE_BASE + 0x007C)
-#define UGE_P16       (UGE_BASE + 0x0080)
-#define UGE_P17       (UGE_BASE + 0x0084)
-#define UGE_P18       (UGE_BASE + 0x0088)
-#define UGE_P19       (UGE_BASE + 0x008C)
-#define UGE_P20       (UGE_BASE + 0x0090)
-#define UGE_P21       (UGE_BASE + 0x0094)
-#define UGE_P22       (UGE_BASE + 0x0098)
-#define UGE_P23       (UGE_BASE + 0x009C)
-#define UGE_P24       (UGE_BASE + 0x00A0)
-#define UGE_P25       (UGE_BASE + 0x00A4)
-#define UGE_P26       (UGE_BASE + 0x00A8)
-#define UGE_P27       (UGE_BASE + 0x00AC)
-#define UGE_P28       (UGE_BASE + 0x00B0)
-#define UGE_P29       (UGE_BASE + 0x00B4)
-#define UGE_P30       (UGE_BASE + 0x00B8)
-#define UGE_P31       (UGE_BASE + 0x00BC)
-
-#define UDE_CFG_DST_MASK	FMASK(2, 8)
-#define UDE_CFG_DST8            FIELD(0x0, 2, 8)
-#define UDE_CFG_DST16           FIELD(0x1, 2, 8)
-#define UDE_CFG_DST24           FIELD(0x2, 2, 8)
-#define UDE_CFG_DST32           FIELD(0x3, 2, 8)
-
-/*
- * GDEN enable UDE_CFG_GDEN_ENABLE
- */
-#define UDE_CFG_GDEN_ENABLE     FIELD(1, 1, 3)
-/*
- * VDEN enable UDE_CFG_VDEN_ENABLE
- */
-#define UDE_CFG_VDEN_ENABLE     FIELD(1, 1, 4)
-/*
- * CDEN enable UDE_CFG_CDEN_ENABLE
- */
-#define UDE_CFG_CDEN_ENABLE     FIELD(1, 1, 5)
-/*
- * TIMEUP enable UDE_CFG_TIMEUP_ENABLE
- */
-#define UDE_CFG_TIMEUP_ENABLE   FIELD(1, 1, 6)
diff --git a/arch/unicore32/include/mach/uncompress.h b/arch/unicore32/include/mach/uncompress.h
deleted file mode 100644
index 0c1a56a1913f..000000000000
--- a/arch/unicore32/include/mach/uncompress.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/include/mach/uncompress.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#ifndef __MACH_PUV3_UNCOMPRESS_H__
-#define __MACH_PUV3_UNCOMPRESS_H__
-
-#include <mach/hardware.h>
-#include <mach/ocd.h>
-
-extern char input_data[];
-extern char input_data_end[];
-
-static void arch_decomp_puts(const char *ptr)
-{
-	char c;
-
-	while ((c = *ptr++) != '\0') {
-		if (c == '\n')
-			putc('\r');
-		putc(c);
-	}
-}
-#define ARCH_HAVE_DECOMP_PUTS
-
-#endif /* __MACH_PUV3_UNCOMPRESS_H__ */
diff --git a/arch/unicore32/include/uapi/asm/Kbuild b/arch/unicore32/include/uapi/asm/Kbuild
deleted file mode 100644
index e78470141932..000000000000
--- a/arch/unicore32/include/uapi/asm/Kbuild
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generic-y += ucontext.h
diff --git a/arch/unicore32/include/uapi/asm/byteorder.h b/arch/unicore32/include/uapi/asm/byteorder.h
deleted file mode 100644
index 864fe4814cf4..000000000000
--- a/arch/unicore32/include/uapi/asm/byteorder.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * linux/arch/unicore32/include/asm/byteorder.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * UniCore ONLY support Little Endian mode, the data bus is connected such
- * that byte accesses appear as:
- *  0 = d0...d7, 1 = d8...d15, 2 = d16...d23, 3 = d24...d31
- * and word accesses (data or instruction) appear as:
- *  d0...d31
- */
-#ifndef __UNICORE_BYTEORDER_H__
-#define __UNICORE_BYTEORDER_H__
-
-#include <linux/byteorder/little_endian.h>
-
-#endif
-
diff --git a/arch/unicore32/include/uapi/asm/ptrace.h b/arch/unicore32/include/uapi/asm/ptrace.h
deleted file mode 100644
index 2820de83e37d..000000000000
--- a/arch/unicore32/include/uapi/asm/ptrace.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * linux/arch/unicore32/include/asm/ptrace.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef _UAPI__UNICORE_PTRACE_H__
-#define _UAPI__UNICORE_PTRACE_H__
-
-#define PTRACE_GET_THREAD_AREA	22
-
-/*
- * PSR bits
- */
-#define USER_MODE	0x00000010
-#define REAL_MODE	0x00000011
-#define INTR_MODE	0x00000012
-#define PRIV_MODE	0x00000013
-#define ABRT_MODE	0x00000017
-#define EXTN_MODE	0x0000001b
-#define SUSR_MODE	0x0000001f
-#define MODE_MASK	0x0000001f
-#define PSR_R_BIT	0x00000040
-#define PSR_I_BIT	0x00000080
-#define PSR_V_BIT	0x10000000
-#define PSR_C_BIT	0x20000000
-#define PSR_Z_BIT	0x40000000
-#define PSR_S_BIT	0x80000000
-
-/*
- * Groups of PSR bits
- */
-#define PSR_f		0xff000000	/* Flags		*/
-#define PSR_c		0x000000ff	/* Control		*/
-
-#ifndef __ASSEMBLY__
-
-/*
- * This struct defines the way the registers are stored on the
- * stack during a system call.  Note that sizeof(struct pt_regs)
- * has to be a multiple of 8.
- */
-struct pt_regs {
-	unsigned long uregs[34];
-};
-
-#define UCreg_asr		uregs[32]
-#define UCreg_pc		uregs[31]
-#define UCreg_lr		uregs[30]
-#define UCreg_sp		uregs[29]
-#define UCreg_ip		uregs[28]
-#define UCreg_fp		uregs[27]
-#define UCreg_26		uregs[26]
-#define UCreg_25		uregs[25]
-#define UCreg_24		uregs[24]
-#define UCreg_23		uregs[23]
-#define UCreg_22		uregs[22]
-#define UCreg_21		uregs[21]
-#define UCreg_20		uregs[20]
-#define UCreg_19		uregs[19]
-#define UCreg_18		uregs[18]
-#define UCreg_17		uregs[17]
-#define UCreg_16		uregs[16]
-#define UCreg_15		uregs[15]
-#define UCreg_14		uregs[14]
-#define UCreg_13		uregs[13]
-#define UCreg_12		uregs[12]
-#define UCreg_11		uregs[11]
-#define UCreg_10		uregs[10]
-#define UCreg_09		uregs[9]
-#define UCreg_08		uregs[8]
-#define UCreg_07		uregs[7]
-#define UCreg_06		uregs[6]
-#define UCreg_05		uregs[5]
-#define UCreg_04		uregs[4]
-#define UCreg_03		uregs[3]
-#define UCreg_02		uregs[2]
-#define UCreg_01		uregs[1]
-#define UCreg_00		uregs[0]
-#define UCreg_ORIG_00		uregs[33]
-
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* _UAPI__UNICORE_PTRACE_H__ */
diff --git a/arch/unicore32/include/uapi/asm/sigcontext.h b/arch/unicore32/include/uapi/asm/sigcontext.h
deleted file mode 100644
index 79e56f28e4b5..000000000000
--- a/arch/unicore32/include/uapi/asm/sigcontext.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * linux/arch/unicore32/include/asm/sigcontext.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#ifndef __UNICORE_SIGCONTEXT_H__
-#define __UNICORE_SIGCONTEXT_H__
-
-#include <asm/ptrace.h>
-/*
- * Signal context structure - contains all info to do with the state
- * before the signal handler was invoked.  Note: only add new entries
- * to the end of the structure.
- */
-struct sigcontext {
-	unsigned long trap_no;
-	unsigned long error_code;
-	unsigned long oldmask;
-	unsigned long fault_address;
-	struct pt_regs regs;
-};
-
-#endif
diff --git a/arch/unicore32/include/uapi/asm/unistd.h b/arch/unicore32/include/uapi/asm/unistd.h
deleted file mode 100644
index 54a7378a70b1..000000000000
--- a/arch/unicore32/include/uapi/asm/unistd.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * linux/arch/unicore32/include/asm/unistd.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#define __ARCH_WANT_RENAMEAT
-#define __ARCH_WANT_SET_GET_RLIMIT
-#define __ARCH_WANT_STAT64
-#define __ARCH_WANT_TIME32_SYSCALLS
-
-/* Use the standard ABI for syscalls. */
-#include <asm-generic/unistd.h>
-#define __ARCH_WANT_SYS_CLONE
diff --git a/arch/unicore32/kernel/Makefile b/arch/unicore32/kernel/Makefile
deleted file mode 100644
index 2f79aa56735b..000000000000
--- a/arch/unicore32/kernel/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the linux kernel.
-#
-
-# Object file lists.
-obj-y				:= dma.o elf.o entry.o process.o ptrace.o
-obj-y				+= setup.o signal.o sys.o stacktrace.o traps.o
-
-obj-$(CONFIG_MODULES)		+= ksyms.o module.o
-obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-
-obj-$(CONFIG_UNICORE_FPU_F64)	+= fpu-ucf64.o
-
-# obj-y for architecture PKUnity v3
-obj-$(CONFIG_ARCH_PUV3)		+= clock.o irq.o time.o
-
-obj-$(CONFIG_PUV3_GPIO)		+= gpio.o
-obj-$(CONFIG_PUV3_PM)		+= pm.o sleep.o
-obj-$(CONFIG_HIBERNATION)	+= hibernate.o hibernate_asm.o
-
-obj-$(CONFIG_PCI)		+= pci.o
-
-# obj-y for specific machines
-obj-$(CONFIG_ARCH_PUV3)		+= puv3-core.o
-obj-$(CONFIG_PUV3_NB0916)	+= puv3-nb0916.o
-
-head-y				:= head.o
-obj-$(CONFIG_DEBUG_LL)		+= debug.o
-
-extra-y				:= $(head-y) vmlinux.lds
diff --git a/arch/unicore32/kernel/asm-offsets.c b/arch/unicore32/kernel/asm-offsets.c
deleted file mode 100644
index f7d672267549..000000000000
--- a/arch/unicore32/kernel/asm-offsets.c
+++ /dev/null
@@ -1,108 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/asm-offsets.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * Generate definitions needed by assembly language modules.
- * This code generates raw asm output which is post-processed to extract
- * and format the required data.
- */
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/dma-mapping.h>
-#include <linux/kbuild.h>
-#include <linux/suspend.h>
-#include <linux/thread_info.h>
-#include <asm/memory.h>
-#include <asm/suspend.h>
-
-/*
- * GCC 3.0, 3.1: general bad code generation.
- * GCC 3.2.0: incorrect function argument offset calculation.
- * GCC 3.2.x: miscompiles NEW_AUX_ENT in fs/binfmt_elf.c
- *	(http://gcc.gnu.org/PR8896) and incorrect structure
- *		initialisation in fs/jffs2/erase.c
- */
-#if (__GNUC__ < 4)
-#error Your compiler should upgrade to uc4
-#error	Known good compilers: 4.2.2
-#endif
-
-int main(void)
-{
-	DEFINE(TSK_ACTIVE_MM,	offsetof(struct task_struct, active_mm));
-	BLANK();
-	DEFINE(TI_FLAGS,	offsetof(struct thread_info, flags));
-	DEFINE(TI_PREEMPT,	offsetof(struct thread_info, preempt_count));
-	DEFINE(TI_ADDR_LIMIT,	offsetof(struct thread_info, addr_limit));
-	DEFINE(TI_TASK,		offsetof(struct thread_info, task));
-	DEFINE(TI_CPU,		offsetof(struct thread_info, cpu));
-	DEFINE(TI_CPU_SAVE,	offsetof(struct thread_info, cpu_context));
-	DEFINE(TI_USED_CP,	offsetof(struct thread_info, used_cp));
-#ifdef CONFIG_UNICORE_FPU_F64
-	DEFINE(TI_FPSTATE,	offsetof(struct thread_info, fpstate));
-#endif
-	BLANK();
-	DEFINE(S_R0,		offsetof(struct pt_regs, UCreg_00));
-	DEFINE(S_R1,		offsetof(struct pt_regs, UCreg_01));
-	DEFINE(S_R2,		offsetof(struct pt_regs, UCreg_02));
-	DEFINE(S_R3,		offsetof(struct pt_regs, UCreg_03));
-	DEFINE(S_R4,		offsetof(struct pt_regs, UCreg_04));
-	DEFINE(S_R5,		offsetof(struct pt_regs, UCreg_05));
-	DEFINE(S_R6,		offsetof(struct pt_regs, UCreg_06));
-	DEFINE(S_R7,		offsetof(struct pt_regs, UCreg_07));
-	DEFINE(S_R8,		offsetof(struct pt_regs, UCreg_08));
-	DEFINE(S_R9,		offsetof(struct pt_regs, UCreg_09));
-	DEFINE(S_R10,		offsetof(struct pt_regs, UCreg_10));
-	DEFINE(S_R11,		offsetof(struct pt_regs, UCreg_11));
-	DEFINE(S_R12,		offsetof(struct pt_regs, UCreg_12));
-	DEFINE(S_R13,		offsetof(struct pt_regs, UCreg_13));
-	DEFINE(S_R14,		offsetof(struct pt_regs, UCreg_14));
-	DEFINE(S_R15,		offsetof(struct pt_regs, UCreg_15));
-	DEFINE(S_R16,		offsetof(struct pt_regs, UCreg_16));
-	DEFINE(S_R17,		offsetof(struct pt_regs, UCreg_17));
-	DEFINE(S_R18,		offsetof(struct pt_regs, UCreg_18));
-	DEFINE(S_R19,		offsetof(struct pt_regs, UCreg_19));
-	DEFINE(S_R20,		offsetof(struct pt_regs, UCreg_20));
-	DEFINE(S_R21,		offsetof(struct pt_regs, UCreg_21));
-	DEFINE(S_R22,		offsetof(struct pt_regs, UCreg_22));
-	DEFINE(S_R23,		offsetof(struct pt_regs, UCreg_23));
-	DEFINE(S_R24,		offsetof(struct pt_regs, UCreg_24));
-	DEFINE(S_R25,		offsetof(struct pt_regs, UCreg_25));
-	DEFINE(S_R26,		offsetof(struct pt_regs, UCreg_26));
-	DEFINE(S_FP,		offsetof(struct pt_regs, UCreg_fp));
-	DEFINE(S_IP,		offsetof(struct pt_regs, UCreg_ip));
-	DEFINE(S_SP,		offsetof(struct pt_regs, UCreg_sp));
-	DEFINE(S_LR,		offsetof(struct pt_regs, UCreg_lr));
-	DEFINE(S_PC,		offsetof(struct pt_regs, UCreg_pc));
-	DEFINE(S_PSR,		offsetof(struct pt_regs, UCreg_asr));
-	DEFINE(S_OLD_R0,	offsetof(struct pt_regs, UCreg_ORIG_00));
-	DEFINE(S_FRAME_SIZE,	sizeof(struct pt_regs));
-	BLANK();
-	DEFINE(VMA_VM_MM,	offsetof(struct vm_area_struct, vm_mm));
-	DEFINE(VMA_VM_FLAGS,	offsetof(struct vm_area_struct, vm_flags));
-	BLANK();
-	DEFINE(VM_EXEC,		VM_EXEC);
-	BLANK();
-	DEFINE(PAGE_SZ,		PAGE_SIZE);
-	BLANK();
-	DEFINE(SYS_ERROR0,	0x9f0000);
-	BLANK();
-	DEFINE(PBE_ADDRESS,		offsetof(struct pbe, address));
-	DEFINE(PBE_ORIN_ADDRESS,	offsetof(struct pbe, orig_address));
-	DEFINE(PBE_NEXT,		offsetof(struct pbe, next));
-	DEFINE(SWSUSP_CPU,		offsetof(struct swsusp_arch_regs, \
-							cpu_context));
-#ifdef	CONFIG_UNICORE_FPU_F64
-	DEFINE(SWSUSP_FPSTATE,		offsetof(struct swsusp_arch_regs, \
-							fpstate));
-#endif
-	BLANK();
-	DEFINE(DMA_BIDIRECTIONAL,	DMA_BIDIRECTIONAL);
-	DEFINE(DMA_TO_DEVICE,		DMA_TO_DEVICE);
-	DEFINE(DMA_FROM_DEVICE,		DMA_FROM_DEVICE);
-	return 0;
-}
diff --git a/arch/unicore32/kernel/clock.c b/arch/unicore32/kernel/clock.c
deleted file mode 100644
index 41df6be0a3b2..000000000000
--- a/arch/unicore32/kernel/clock.c
+++ /dev/null
@@ -1,387 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/clock.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/device.h>
-#include <linux/list.h>
-#include <linux/errno.h>
-#include <linux/err.h>
-#include <linux/string.h>
-#include <linux/clk.h>
-#include <linux/mutex.h>
-#include <linux/delay.h>
-#include <linux/io.h>
-
-#include <mach/hardware.h>
-
-/*
- * Very simple clock implementation
- */
-struct clk {
-	struct list_head	node;
-	unsigned long		rate;
-	const char		*name;
-};
-
-static struct clk clk_ost_clk = {
-	.name		= "OST_CLK",
-	.rate		= CLOCK_TICK_RATE,
-};
-
-static struct clk clk_mclk_clk = {
-	.name		= "MAIN_CLK",
-};
-
-static struct clk clk_bclk32_clk = {
-	.name		= "BUS32_CLK",
-};
-
-static struct clk clk_ddr_clk = {
-	.name		= "DDR_CLK",
-};
-
-static struct clk clk_vga_clk = {
-	.name		= "VGA_CLK",
-};
-
-static LIST_HEAD(clocks);
-static DEFINE_MUTEX(clocks_mutex);
-
-struct clk *clk_get(struct device *dev, const char *id)
-{
-	struct clk *p, *clk = ERR_PTR(-ENOENT);
-
-	mutex_lock(&clocks_mutex);
-	list_for_each_entry(p, &clocks, node) {
-		if (strcmp(id, p->name) == 0) {
-			clk = p;
-			break;
-		}
-	}
-	mutex_unlock(&clocks_mutex);
-
-	return clk;
-}
-EXPORT_SYMBOL(clk_get);
-
-void clk_put(struct clk *clk)
-{
-}
-EXPORT_SYMBOL(clk_put);
-
-int clk_enable(struct clk *clk)
-{
-	return 0;
-}
-EXPORT_SYMBOL(clk_enable);
-
-void clk_disable(struct clk *clk)
-{
-}
-EXPORT_SYMBOL(clk_disable);
-
-unsigned long clk_get_rate(struct clk *clk)
-{
-	return clk->rate;
-}
-EXPORT_SYMBOL(clk_get_rate);
-
-struct {
-	unsigned long rate;
-	unsigned long cfg;
-	unsigned long div;
-} vga_clk_table[] = {
-	{.rate =  25175000, .cfg = 0x00002001, .div = 0x9},
-	{.rate =  31500000, .cfg = 0x00002001, .div = 0x7},
-	{.rate =  40000000, .cfg = 0x00003801, .div = 0x9},
-	{.rate =  49500000, .cfg = 0x00003801, .div = 0x7},
-	{.rate =  65000000, .cfg = 0x00002c01, .div = 0x4},
-	{.rate =  78750000, .cfg = 0x00002400, .div = 0x7},
-	{.rate = 108000000, .cfg = 0x00002c01, .div = 0x2},
-	{.rate = 106500000, .cfg = 0x00003c01, .div = 0x3},
-	{.rate =  50650000, .cfg = 0x00106400, .div = 0x9},
-	{.rate =  61500000, .cfg = 0x00106400, .div = 0xa},
-	{.rate =  85500000, .cfg = 0x00002800, .div = 0x6},
-};
-
-struct {
-	unsigned long mrate;
-	unsigned long prate;
-} mclk_clk_table[] = {
-	{.mrate = 500000000, .prate = 0x00109801},
-	{.mrate = 525000000, .prate = 0x00104C00},
-	{.mrate = 550000000, .prate = 0x00105000},
-	{.mrate = 575000000, .prate = 0x00105400},
-	{.mrate = 600000000, .prate = 0x00105800},
-	{.mrate = 625000000, .prate = 0x00105C00},
-	{.mrate = 650000000, .prate = 0x00106000},
-	{.mrate = 675000000, .prate = 0x00106400},
-	{.mrate = 700000000, .prate = 0x00106800},
-	{.mrate = 725000000, .prate = 0x00106C00},
-	{.mrate = 750000000, .prate = 0x00107000},
-	{.mrate = 775000000, .prate = 0x00107400},
-	{.mrate = 800000000, .prate = 0x00107800},
-};
-
-int clk_set_rate(struct clk *clk, unsigned long rate)
-{
-	if (clk == &clk_vga_clk) {
-		unsigned long pll_vgacfg, pll_vgadiv;
-		int ret, i;
-
-		/* lookup vga_clk_table */
-		ret = -EINVAL;
-		for (i = 0; i < ARRAY_SIZE(vga_clk_table); i++) {
-			if (rate == vga_clk_table[i].rate) {
-				pll_vgacfg = vga_clk_table[i].cfg;
-				pll_vgadiv = vga_clk_table[i].div;
-				ret = 0;
-				break;
-			}
-		}
-
-		if (ret)
-			return ret;
-
-		if (readl(PM_PLLVGACFG) == pll_vgacfg)
-			return 0;
-
-		/* set pll vga cfg reg. */
-		writel(pll_vgacfg, PM_PLLVGACFG);
-
-		writel(PM_PMCR_CFBVGA, PM_PMCR);
-		while ((readl(PM_PLLDFCDONE) & PM_PLLDFCDONE_VGADFC)
-				!= PM_PLLDFCDONE_VGADFC)
-			udelay(100); /* about 1ms */
-
-		/* set div cfg reg. */
-		writel(readl(PM_PCGR) | PM_PCGR_VGACLK, PM_PCGR);
-
-		writel((readl(PM_DIVCFG) & ~PM_DIVCFG_VGACLK_MASK)
-				| PM_DIVCFG_VGACLK(pll_vgadiv), PM_DIVCFG);
-
-		writel(readl(PM_SWRESET) | PM_SWRESET_VGADIV, PM_SWRESET);
-		while ((readl(PM_SWRESET) & PM_SWRESET_VGADIV)
-				== PM_SWRESET_VGADIV)
-			udelay(100); /* 65536 bclk32, about 320us */
-
-		writel(readl(PM_PCGR) & ~PM_PCGR_VGACLK, PM_PCGR);
-	}
-#ifdef CONFIG_CPU_FREQ
-	if (clk == &clk_mclk_clk) {
-		u32 pll_rate, divstatus = readl(PM_DIVSTATUS);
-		int ret, i;
-
-		/* lookup mclk_clk_table */
-		ret = -EINVAL;
-		for (i = 0; i < ARRAY_SIZE(mclk_clk_table); i++) {
-			if (rate == mclk_clk_table[i].mrate) {
-				pll_rate = mclk_clk_table[i].prate;
-				clk_mclk_clk.rate = mclk_clk_table[i].mrate;
-				ret = 0;
-				break;
-			}
-		}
-
-		if (ret)
-			return ret;
-
-		if (clk_mclk_clk.rate)
-			clk_bclk32_clk.rate = clk_mclk_clk.rate
-				/ (((divstatus & 0x0000f000) >> 12) + 1);
-
-		/* set pll sys cfg reg. */
-		writel(pll_rate, PM_PLLSYSCFG);
-
-		writel(PM_PMCR_CFBSYS, PM_PMCR);
-		while ((readl(PM_PLLDFCDONE) & PM_PLLDFCDONE_SYSDFC)
-				!= PM_PLLDFCDONE_SYSDFC)
-			udelay(100);
-			/* about 1ms */
-	}
-#endif
-	return 0;
-}
-EXPORT_SYMBOL(clk_set_rate);
-
-int clk_register(struct clk *clk)
-{
-	mutex_lock(&clocks_mutex);
-	list_add(&clk->node, &clocks);
-	mutex_unlock(&clocks_mutex);
-	printk(KERN_DEFAULT "PKUnity PM: %s %lu.%02luM\n", clk->name,
-		(clk->rate)/1000000, (clk->rate)/10000 % 100);
-	return 0;
-}
-EXPORT_SYMBOL(clk_register);
-
-void clk_unregister(struct clk *clk)
-{
-	mutex_lock(&clocks_mutex);
-	list_del(&clk->node);
-	mutex_unlock(&clocks_mutex);
-}
-EXPORT_SYMBOL(clk_unregister);
-
-struct {
-	unsigned long prate;
-	unsigned long rate;
-} pllrate_table[] = {
-	{.prate = 0x00002001, .rate = 250000000},
-	{.prate = 0x00104801, .rate = 250000000},
-	{.prate = 0x00104C01, .rate = 262500000},
-	{.prate = 0x00002401, .rate = 275000000},
-	{.prate = 0x00105001, .rate = 275000000},
-	{.prate = 0x00105401, .rate = 287500000},
-	{.prate = 0x00002801, .rate = 300000000},
-	{.prate = 0x00105801, .rate = 300000000},
-	{.prate = 0x00105C01, .rate = 312500000},
-	{.prate = 0x00002C01, .rate = 325000000},
-	{.prate = 0x00106001, .rate = 325000000},
-	{.prate = 0x00106401, .rate = 337500000},
-	{.prate = 0x00003001, .rate = 350000000},
-	{.prate = 0x00106801, .rate = 350000000},
-	{.prate = 0x00106C01, .rate = 362500000},
-	{.prate = 0x00003401, .rate = 375000000},
-	{.prate = 0x00107001, .rate = 375000000},
-	{.prate = 0x00107401, .rate = 387500000},
-	{.prate = 0x00003801, .rate = 400000000},
-	{.prate = 0x00107801, .rate = 400000000},
-	{.prate = 0x00107C01, .rate = 412500000},
-	{.prate = 0x00003C01, .rate = 425000000},
-	{.prate = 0x00108001, .rate = 425000000},
-	{.prate = 0x00108401, .rate = 437500000},
-	{.prate = 0x00004001, .rate = 450000000},
-	{.prate = 0x00108801, .rate = 450000000},
-	{.prate = 0x00108C01, .rate = 462500000},
-	{.prate = 0x00004401, .rate = 475000000},
-	{.prate = 0x00109001, .rate = 475000000},
-	{.prate = 0x00109401, .rate = 487500000},
-	{.prate = 0x00004801, .rate = 500000000},
-	{.prate = 0x00109801, .rate = 500000000},
-	{.prate = 0x00104C00, .rate = 525000000},
-	{.prate = 0x00002400, .rate = 550000000},
-	{.prate = 0x00105000, .rate = 550000000},
-	{.prate = 0x00105400, .rate = 575000000},
-	{.prate = 0x00002800, .rate = 600000000},
-	{.prate = 0x00105800, .rate = 600000000},
-	{.prate = 0x00105C00, .rate = 625000000},
-	{.prate = 0x00002C00, .rate = 650000000},
-	{.prate = 0x00106000, .rate = 650000000},
-	{.prate = 0x00106400, .rate = 675000000},
-	{.prate = 0x00003000, .rate = 700000000},
-	{.prate = 0x00106800, .rate = 700000000},
-	{.prate = 0x00106C00, .rate = 725000000},
-	{.prate = 0x00003400, .rate = 750000000},
-	{.prate = 0x00107000, .rate = 750000000},
-	{.prate = 0x00107400, .rate = 775000000},
-	{.prate = 0x00003800, .rate = 800000000},
-	{.prate = 0x00107800, .rate = 800000000},
-	{.prate = 0x00107C00, .rate = 825000000},
-	{.prate = 0x00003C00, .rate = 850000000},
-	{.prate = 0x00108000, .rate = 850000000},
-	{.prate = 0x00108400, .rate = 875000000},
-	{.prate = 0x00004000, .rate = 900000000},
-	{.prate = 0x00108800, .rate = 900000000},
-	{.prate = 0x00108C00, .rate = 925000000},
-	{.prate = 0x00004400, .rate = 950000000},
-	{.prate = 0x00109000, .rate = 950000000},
-	{.prate = 0x00109400, .rate = 975000000},
-	{.prate = 0x00004800, .rate = 1000000000},
-	{.prate = 0x00109800, .rate = 1000000000},
-};
-
-struct {
-	unsigned long prate;
-	unsigned long drate;
-} pddr_table[] = {
-	{.prate = 0x00100800, .drate = 44236800},
-	{.prate = 0x00100C00, .drate = 66355200},
-	{.prate = 0x00101000, .drate = 88473600},
-	{.prate = 0x00101400, .drate = 110592000},
-	{.prate = 0x00101800, .drate = 132710400},
-	{.prate = 0x00101C01, .drate = 154828800},
-	{.prate = 0x00102001, .drate = 176947200},
-	{.prate = 0x00102401, .drate = 199065600},
-	{.prate = 0x00102801, .drate = 221184000},
-	{.prate = 0x00102C01, .drate = 243302400},
-	{.prate = 0x00103001, .drate = 265420800},
-	{.prate = 0x00103401, .drate = 287539200},
-	{.prate = 0x00103801, .drate = 309657600},
-	{.prate = 0x00103C01, .drate = 331776000},
-	{.prate = 0x00104001, .drate = 353894400},
-};
-
-static int __init clk_init(void)
-{
-#ifdef CONFIG_PUV3_PM
-	u32 pllrate, divstatus = readl(PM_DIVSTATUS);
-	u32 pcgr_val = readl(PM_PCGR);
-	int i;
-
-	pcgr_val |= PM_PCGR_BCLKMME | PM_PCGR_BCLKH264E | PM_PCGR_BCLKH264D
-			| PM_PCGR_HECLK | PM_PCGR_HDCLK;
-	writel(pcgr_val, PM_PCGR);
-
-	pllrate = readl(PM_PLLSYSSTATUS);
-
-	/* lookup pmclk_table */
-	clk_mclk_clk.rate = 0;
-	for (i = 0; i < ARRAY_SIZE(pllrate_table); i++) {
-		if (pllrate == pllrate_table[i].prate) {
-			clk_mclk_clk.rate = pllrate_table[i].rate;
-			break;
-		}
-	}
-
-	if (clk_mclk_clk.rate)
-		clk_bclk32_clk.rate = clk_mclk_clk.rate /
-			(((divstatus & 0x0000f000) >> 12) + 1);
-
-	pllrate = readl(PM_PLLDDRSTATUS);
-
-	/* lookup pddr_table */
-	clk_ddr_clk.rate = 0;
-	for (i = 0; i < ARRAY_SIZE(pddr_table); i++) {
-		if (pllrate == pddr_table[i].prate) {
-			clk_ddr_clk.rate = pddr_table[i].drate;
-			break;
-		}
-	}
-
-	pllrate = readl(PM_PLLVGASTATUS);
-
-	/* lookup pvga_table */
-	clk_vga_clk.rate = 0;
-	for (i = 0; i < ARRAY_SIZE(pllrate_table); i++) {
-		if (pllrate == pllrate_table[i].prate) {
-			clk_vga_clk.rate = pllrate_table[i].rate;
-			break;
-		}
-	}
-
-	if (clk_vga_clk.rate)
-		clk_vga_clk.rate = clk_vga_clk.rate /
-			(((divstatus & 0x00f00000) >> 20) + 1);
-
-	clk_register(&clk_vga_clk);
-#endif
-#ifdef CONFIG_ARCH_FPGA
-	clk_ddr_clk.rate = 33000000;
-	clk_mclk_clk.rate = 33000000;
-	clk_bclk32_clk.rate = 33000000;
-#endif
-	clk_register(&clk_ddr_clk);
-	clk_register(&clk_mclk_clk);
-	clk_register(&clk_bclk32_clk);
-	clk_register(&clk_ost_clk);
-	return 0;
-}
-core_initcall(clk_init);
diff --git a/arch/unicore32/kernel/debug-macro.S b/arch/unicore32/kernel/debug-macro.S
deleted file mode 100644
index 7e2da0de4f71..000000000000
--- a/arch/unicore32/kernel/debug-macro.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/debug-macro.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * Debugging macro include header
- */
-#include <generated/asm-offsets.h>
-#include <mach/hardware.h>
-
-		.macro	put_word_ocd, rd, rx=r16
-1001:		movc		\rx, p1.c0, #0
-		cand.a	\rx, #2
-		bne	1001b
-		movc		p1.c1, \rd, #1
-		.endm
-
-#ifdef CONFIG_DEBUG_OCD
-		/* debug using UniCore On-Chip-Debugger */
-		.macro	addruart, rx
-		.endm
-
-		.macro	senduart, rd, rx
-		put_word_ocd	\rd, \rx
-		.endm
-
-		.macro	busyuart, rd, rx
-		.endm
-
-		.macro	waituart, rd, rx
-		.endm
-#else
-#define UART_CLK_DEFAULT        3686400 * 20
-	/* Uartclk = MCLK/ 2, The MCLK on my board is 3686400 * 40  */
-#define BAUD_RATE_DEFAULT	115200
-	/* The baud rate of the serial port */
-
-#define UART_DIVISOR_DEFAULT	(UART_CLK_DEFAULT \
-				/ (16 * BAUD_RATE_DEFAULT) - 1)
-
-		.macro	addruart,rx
-		mrc	p0, #0, \rx, c1, c0
-		tst	\rx, #1			@ MMU enabled?
-		moveq	\rx, #0xee000000	@ physical base address
-		movne	\rx, #0x6e000000	@ virtual address
-
-		@ We probe for the active serial port here
-		@ However, now we assume UART0 is active:	epip4d
-		@ We assume r1 and r2 can be clobbered.
-
-		movl 	r2, #UART_DIVISOR_DEFAULT
-		mov 	r1, #0x80
-		str	r1, [\rx, #UART_LCR_OFFSET]
-		and	r1, r2, #0xff00
-		mov	r1, r1, lsr #8
-		str	r1, [\rx, #UART_DLH_OFFSET]
-		and	r1, r2, #0xff
-		str	r1, [\rx, #UART_DLL_OFFSET]
-		mov 	r1, #0x7
-		str	r1, [\rx, #UART_FCR_OFFSET]
-		mov 	r1, #0x3
-		str	r1, [\rx, #UART_LCR_OFFSET]
-		mov 	r1, #0x0
-		str	r1, [\rx, #UART_IER_OFFSET]
-		.endm
-
-		.macro	senduart,rd,rx
-		str	\rd, [\rx, #UART_THR_OFFSET]
-		.endm
-
-		.macro	waituart,rd,rx
-1001:		ldr	\rd, [\rx, #UART_LSR_OFFSET]
-		tst	\rd, #UART_LSR_THRE
-		beq	1001b
-		.endm
-
-		.macro	busyuart,rd,rx
-1001:		ldr	\rd, [\rx, #UART_LSR_OFFSET]
-		tst	\rd, #UART_LSR_TEMT
-		bne	1001b
-		.endm
-#endif
-
diff --git a/arch/unicore32/kernel/debug.S b/arch/unicore32/kernel/debug.S
deleted file mode 100644
index 13bc8c8550e4..000000000000
--- a/arch/unicore32/kernel/debug.S
+++ /dev/null
@@ -1,82 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/debug.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  32-bit debugging code
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-		.text
-
-/*
- * Some debugging routines (useful if you've got MM problems and
- * printk isn't working).  For DEBUGGING ONLY!!!  Do not leave
- * references to these in a production kernel!
- */
-#include "debug-macro.S"
-
-/*
- * Useful debugging routines
- */
-ENTRY(printhex8)
-		mov	r1, #8
-		b	printhex
-ENDPROC(printhex8)
-
-ENTRY(printhex4)
-		mov	r1, #4
-		b	printhex
-ENDPROC(printhex4)
-
-ENTRY(printhex2)
-		mov	r1, #2
-printhex:	adr	r2, hexbuf
-		add	r3, r2, r1
-		mov	r1, #0
-		stb	r1, [r3]
-1:		and	r1, r0, #15
-		mov	r0, r0 >> #4
-		csub.a	r1, #10
-		beg	2f
-		add	r1, r1, #'0' - 'a' + 10
-2:		add	r1, r1, #'a' - 10
-		stb.w	r1, [r3+], #-1
-		cxor.a	r3, r2
-		bne	1b
-		mov	r0, r2
-		b	printascii
-ENDPROC(printhex2)
-
-		.ltorg
-
-ENTRY(printascii)
-		addruart r3
-		b	2f
-1:		waituart r2, r3
-		senduart r1, r3
-		busyuart r2, r3
-		cxor.a	r1, #'\n'
-		cmoveq	r1, #'\r'
-		beq	1b
-2:		cxor.a	r0, #0
-		beq	3f
-		ldb.w	r1, [r0]+, #1
-		cxor.a	r1, #0
-		bne	1b
-3:		mov	pc, lr
-ENDPROC(printascii)
-
-ENTRY(printch)
-		addruart r3
-		mov	r1, r0
-		mov	r0, #0
-		b	1b
-ENDPROC(printch)
-
-hexbuf:		.space 16
-
diff --git a/arch/unicore32/kernel/dma.c b/arch/unicore32/kernel/dma.c
deleted file mode 100644
index 7a0e2d4d6077..000000000000
--- a/arch/unicore32/kernel/dma.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/dma.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/errno.h>
-#include <linux/io.h>
-
-#include <asm/irq.h>
-#include <mach/hardware.h>
-#include <mach/dma.h>
-
-struct dma_channel {
-	char *name;
-	puv3_dma_prio prio;
-	void (*irq_handler)(int, void *);
-	void (*err_handler)(int, void *);
-	void *data;
-};
-
-static struct dma_channel dma_channels[MAX_DMA_CHANNELS];
-
-int puv3_request_dma(char *name, puv3_dma_prio prio,
-			 void (*irq_handler)(int, void *),
-			 void (*err_handler)(int, void *),
-			 void *data)
-{
-	unsigned long flags;
-	int i, found = 0;
-
-	/* basic sanity checks */
-	if (!name)
-		return -EINVAL;
-
-	local_irq_save(flags);
-
-	do {
-		/* try grabbing a DMA channel with the requested priority */
-		for (i = 0; i < MAX_DMA_CHANNELS; i++) {
-			if ((dma_channels[i].prio == prio) &&
-			    !dma_channels[i].name) {
-				found = 1;
-				break;
-			}
-		}
-		/* if requested prio group is full, try a hier priority */
-	} while (!found && prio--);
-
-	if (found) {
-		dma_channels[i].name = name;
-		dma_channels[i].irq_handler = irq_handler;
-		dma_channels[i].err_handler = err_handler;
-		dma_channels[i].data = data;
-	} else {
-		printk(KERN_WARNING "No more available DMA channels for %s\n",
-				name);
-		i = -ENODEV;
-	}
-
-	local_irq_restore(flags);
-	return i;
-}
-EXPORT_SYMBOL(puv3_request_dma);
-
-void puv3_free_dma(int dma_ch)
-{
-	unsigned long flags;
-
-	if (!dma_channels[dma_ch].name) {
-		printk(KERN_CRIT
-			"%s: trying to free channel %d which is already freed\n",
-			__func__, dma_ch);
-		return;
-	}
-
-	local_irq_save(flags);
-	dma_channels[dma_ch].name = NULL;
-	dma_channels[dma_ch].err_handler = NULL;
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL(puv3_free_dma);
-
-static irqreturn_t dma_irq_handler(int irq, void *dev_id)
-{
-	int i, dint;
-
-	dint = readl(DMAC_ITCSR);
-	for (i = 0; i < MAX_DMA_CHANNELS; i++) {
-		if (dint & DMAC_CHANNEL(i)) {
-			struct dma_channel *channel = &dma_channels[i];
-
-			/* Clear TC interrupt of channel i */
-			writel(DMAC_CHANNEL(i), DMAC_ITCCR);
-			writel(0, DMAC_ITCCR);
-
-			if (channel->name && channel->irq_handler) {
-				channel->irq_handler(i, channel->data);
-			} else {
-				/*
-				 * IRQ for an unregistered DMA channel:
-				 * let's clear the interrupts and disable it.
-				 */
-				printk(KERN_WARNING "spurious IRQ for"
-						" DMA channel %d\n", i);
-			}
-		}
-	}
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t dma_err_handler(int irq, void *dev_id)
-{
-	int i, dint;
-
-	dint = readl(DMAC_IESR);
-	for (i = 0; i < MAX_DMA_CHANNELS; i++) {
-		if (dint & DMAC_CHANNEL(i)) {
-			struct dma_channel *channel = &dma_channels[i];
-
-			/* Clear Err interrupt of channel i */
-			writel(DMAC_CHANNEL(i), DMAC_IECR);
-			writel(0, DMAC_IECR);
-
-			if (channel->name && channel->err_handler) {
-				channel->err_handler(i, channel->data);
-			} else {
-				/*
-				 * IRQ for an unregistered DMA channel:
-				 * let's clear the interrupts and disable it.
-				 */
-				printk(KERN_WARNING "spurious IRQ for"
-						" DMA channel %d\n", i);
-			}
-		}
-	}
-	return IRQ_HANDLED;
-}
-
-int __init puv3_init_dma(void)
-{
-	int i, ret;
-
-	/* dma channel priorities on v8 processors:
-	 * ch 0 - 1  <--> (0) DMA_PRIO_HIGH
-	 * ch 2 - 3  <--> (1) DMA_PRIO_MEDIUM
-	 * ch 4 - 5  <--> (2) DMA_PRIO_LOW
-	 */
-	for (i = 0; i < MAX_DMA_CHANNELS; i++) {
-		puv3_stop_dma(i);
-		dma_channels[i].name = NULL;
-		dma_channels[i].prio = min((i & 0x7) >> 1, DMA_PRIO_LOW);
-	}
-
-	ret = request_irq(IRQ_DMA, dma_irq_handler, 0, "DMA", NULL);
-	if (ret) {
-		printk(KERN_CRIT "Can't register IRQ for DMA\n");
-		return ret;
-	}
-
-	ret = request_irq(IRQ_DMAERR, dma_err_handler, 0, "DMAERR", NULL);
-	if (ret) {
-		printk(KERN_CRIT "Can't register IRQ for DMAERR\n");
-		free_irq(IRQ_DMA, "DMA");
-		return ret;
-	}
-
-	return 0;
-}
-
-postcore_initcall(puv3_init_dma);
diff --git a/arch/unicore32/kernel/early_printk.c b/arch/unicore32/kernel/early_printk.c
deleted file mode 100644
index c00b6712b8f7..000000000000
--- a/arch/unicore32/kernel/early_printk.c
+++ /dev/null
@@ -1,46 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/early_printk.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/console.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <mach/ocd.h>
-
-/* On-Chip-Debugger functions */
-
-static void early_ocd_write(struct console *con, const char *s, unsigned n)
-{
-	while (*s && n-- > 0) {
-		if (*s == '\n')
-			ocd_putc((int)'\r');
-		ocd_putc((int)*s);
-		s++;
-	}
-}
-
-static struct console early_ocd_console = {
-	.name =		"earlyocd",
-	.write =	early_ocd_write,
-	.flags =	CON_PRINTBUFFER,
-	.index =	-1,
-};
-
-static int __init setup_early_printk(char *buf)
-{
-	if (!buf || early_console)
-		return 0;
-
-	early_console = &early_ocd_console;
-	if (strstr(buf, "keep"))
-		early_console->flags &= ~CON_BOOT;
-	else
-		early_console->flags |= CON_BOOT;
-	register_console(early_console);
-	return 0;
-}
-early_param("earlyprintk", setup_early_printk);
diff --git a/arch/unicore32/kernel/elf.c b/arch/unicore32/kernel/elf.c
deleted file mode 100644
index 22adc65a03e9..000000000000
--- a/arch/unicore32/kernel/elf.c
+++ /dev/null
@@ -1,35 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/elf.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/personality.h>
-#include <linux/binfmts.h>
-#include <linux/elf.h>
-
-int elf_check_arch(const struct elf32_hdr *x)
-{
-	/* Make sure it's an UniCore executable */
-	if (x->e_machine != EM_UNICORE)
-		return 0;
-
-	/* Make sure the entry address is reasonable */
-	if (x->e_entry & 3)
-		return 0;
-
-	return 1;
-}
-EXPORT_SYMBOL(elf_check_arch);
-
-void elf_set_personality(const struct elf32_hdr *x)
-{
-	unsigned int personality = PER_LINUX;
-
-	set_personality(personality);
-}
-EXPORT_SYMBOL(elf_set_personality);
diff --git a/arch/unicore32/kernel/entry.S b/arch/unicore32/kernel/entry.S
deleted file mode 100644
index b35dc83069cb..000000000000
--- a/arch/unicore32/kernel/entry.S
+++ /dev/null
@@ -1,802 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/entry.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  Low-level vector interface routines
- */
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/errno.h>
-#include <asm/thread_info.h>
-#include <asm/memory.h>
-#include <asm/unistd.h>
-#include <generated/asm-offsets.h>
-#include "debug-macro.S"
-
-@
-@ Most of the stack format comes from struct pt_regs, but with
-@ the addition of 8 bytes for storing syscall args 5 and 6.
-@
-#define S_OFF		8
-
-/*
- * The SWI code relies on the fact that R0 is at the bottom of the stack
- * (due to slow/fast restore user regs).
- */
-#if S_R0 != 0
-#error "Please fix"
-#endif
-
-	.macro	zero_fp
-#ifdef CONFIG_FRAME_POINTER
-	mov	fp, #0
-#endif
-	.endm
-
-	.macro	alignment_trap, rtemp
-#ifdef CONFIG_ALIGNMENT_TRAP
-	ldw	\rtemp, .LCcralign
-	ldw	\rtemp, [\rtemp]
-	movc	p0.c1, \rtemp, #0
-#endif
-	.endm
-
-	.macro	load_user_sp_lr, rd, rtemp, offset = 0
-	mov	\rtemp, asr
-	xor	\rtemp, \rtemp, #(PRIV_MODE ^ SUSR_MODE)
-	mov.a	asr, \rtemp			@ switch to the SUSR mode
-
-	ldw	sp, [\rd+], #\offset		@ load sp_user
-	ldw	lr, [\rd+], #\offset + 4	@ load lr_user
-
-	xor	\rtemp, \rtemp, #(PRIV_MODE ^ SUSR_MODE)
-	mov.a	asr, \rtemp			@ switch back to the PRIV mode
-	.endm
-
-	.macro	priv_exit, rpsr
-	mov.a	bsr, \rpsr
-	ldm.w	(r0 - r15), [sp]+
-	ldm.b	(r16 - pc), [sp]+		@ load r0 - pc, asr
-	.endm
-
-	.macro	restore_user_regs, fast = 0, offset = 0
-	ldw	r1, [sp+], #\offset + S_PSR	@ get calling asr
-	ldw	lr, [sp+], #\offset + S_PC	@ get pc
-	mov.a	bsr, r1				@ save in bsr_priv
-	.if	\fast
-	add	sp, sp, #\offset + S_R1		@ r0 is syscall return value
-	ldm.w	(r1 - r15), [sp]+		@ get calling r1 - r15
-	ldur	(r16 - lr), [sp]+		@ get calling r16 - lr
-	.else
-	ldm.w	(r0 - r15), [sp]+		@ get calling r0 - r15
-	ldur	(r16 - lr), [sp]+		@ get calling r16 - lr
-	.endif
-	nop
-	add	sp, sp, #S_FRAME_SIZE - S_R16
-	mov.a	pc, lr				@ return
-						@ and move bsr_priv into asr
-	.endm
-
-	.macro	get_thread_info, rd
-	mov	\rd, sp >> #13
-	mov	\rd, \rd << #13
-	.endm
-
-	.macro	get_irqnr_and_base, irqnr, irqstat, base, tmp
-	ldw	\base, =(PKUNITY_INTC_BASE)
-	ldw	\irqstat, [\base+], #0xC	@ INTC_ICIP
-	ldw	\tmp,	  [\base+], #0x4	@ INTC_ICMR
-	and.a	\irqstat, \irqstat, \tmp
-	beq	1001f
-	cntlz	\irqnr, \irqstat
-	rsub	\irqnr, \irqnr, #31
-1001:	/* EQ will be set if no irqs pending */
-	.endm
-
-#ifdef CONFIG_DEBUG_LL
-	.macro	printreg, reg, temp
-		adr	\temp, 901f
-		stm	(r0-r3), [\temp]+
-		stw	lr, [\temp+], #0x10
-		mov	r0, \reg
-		b.l	printhex8
-		mov	r0, #':'
-		b.l	printch
-		mov	r0, pc
-		b.l	printhex8
-		adr	r0, 902f
-		b.l	printascii
-		adr	\temp, 901f
-		ldm	(r0-r3), [\temp]+
-		ldw	lr, [\temp+], #0x10
-		b	903f
-901:	.word	0, 0, 0, 0, 0	@ r0-r3, lr
-902:	.asciz	": epip4d\n"
-	.align
-903:
-	.endm
-#endif
-
-/*
- * These are the registers used in the syscall handler, and allow us to
- * have in theory up to 7 arguments to a function - r0 to r6.
- *
- * Note that tbl == why is intentional.
- *
- * We must set at least "tsk" and "why" when calling ret_with_reschedule.
- */
-scno	.req	r21		@ syscall number
-tbl	.req	r22		@ syscall table pointer
-why	.req	r22		@ Linux syscall (!= 0)
-tsk	.req	r23		@ current thread_info
-
-/*
- * Interrupt handling.  Preserves r17, r18, r19
- */
-	.macro	intr_handler
-1:	get_irqnr_and_base r0, r6, r5, lr
-	beq	2f
-	mov	r1, sp
-	@
-	@ routine called with r0 = irq number, r1 = struct pt_regs *
-	@
-	adr	lr, 1b
-	b	asm_do_IRQ
-2:
-	.endm
-
-/*
- * PRIV mode handlers
- */
-	.macro	priv_entry
-	sub	sp, sp, #(S_FRAME_SIZE - 4)
-	stm	(r1 - r15), [sp]+
-	add	r5, sp, #S_R15
-	stm	(r16 - r28), [r5]+
-
-	ldm	(r1 - r3), [r0]+
-	add	r5, sp, #S_SP - 4	@ here for interlock avoidance
-	mov	r4, #-1			@  ""  ""      ""       ""
-	add	r0, sp, #(S_FRAME_SIZE - 4)
-	stw.w	r1, [sp+], #-4		@ save the "real" r0 copied
-					@ from the exception stack
-
-	mov	r1, lr
-
-	@
-	@ We are now ready to fill in the remaining blanks on the stack:
-	@
-	@  r0 - sp_priv
-	@  r1 - lr_priv
-	@  r2 - lr_<exception>, already fixed up for correct return/restart
-	@  r3 - bsr_<exception>
-	@  r4 - orig_r0 (see pt_regs definition in ptrace.h)
-	@
-	stm	(r0 - r4), [r5]+
-	.endm
-
-/*
- * User mode handlers
- *
- */
-	.macro	user_entry
-	sub	sp, sp, #S_FRAME_SIZE
-	stm	(r1 - r15), [sp+]
-	add	r4, sp, #S_R16
-	stm	(r16 - r28), [r4]+
-
-	ldm	(r1 - r3), [r0]+
-	add	r0, sp, #S_PC		@ here for interlock avoidance
-	mov	r4, #-1			@  ""  ""     ""        ""
-
-	stw	r1, [sp]		@ save the "real" r0 copied
-					@ from the exception stack
-
-	@
-	@ We are now ready to fill in the remaining blanks on the stack:
-	@
-	@  r2 - lr_<exception>, already fixed up for correct return/restart
-	@  r3 - bsr_<exception>
-	@  r4 - orig_r0 (see pt_regs definition in ptrace.h)
-	@
-	@ Also, separately save sp_user and lr_user
-	@
-	stm	(r2 - r4), [r0]+
-	stur	(sp, lr), [r0-]
-
-	@
-	@ Enable the alignment trap while in kernel mode
-	@
-	alignment_trap r0
-
-	@
-	@ Clear FP to mark the first stack frame
-	@
-	zero_fp
-	.endm
-
-	.text
-
-@
-@ __invalid - generic code for failed exception
-@			(re-entrant version of handlers)
-@
-__invalid:
-	sub	sp, sp, #S_FRAME_SIZE
-	stm	(r1 - r15), [sp+]
-	add	r1, sp, #S_R16
-	stm	(r16 - r28, sp, lr), [r1]+
-
-	zero_fp
-
-	ldm	(r4 - r6), [r0]+
-	add	r0, sp, #S_PC		@ here for interlock avoidance
-	mov	r7, #-1			@  ""   ""    ""        ""
-	stw	r4, [sp]		@ save preserved r0
-	stm	(r5 - r7), [r0]+	@ lr_<exception>,
-					@ asr_<exception>, "old_r0"
-
-	mov	r0, sp
-	mov	r1, asr
-	b	bad_mode
-ENDPROC(__invalid)
-
-	.align	5
-__dabt_priv:
-	priv_entry
-
-	@
-	@ get ready to re-enable interrupts if appropriate
-	@
-	mov	r17, asr
-	cand.a	r3, #PSR_I_BIT
-	bne	1f
-	andn	r17, r17, #PSR_I_BIT
-1:
-
-	@
-	@ Call the processor-specific abort handler:
-	@
-	@  r2 - aborted context pc
-	@  r3 - aborted context asr
-	@
-	@ The abort handler must return the aborted address in r0, and
-	@ the fault status register in r1.
-	@
-	movc	r1, p0.c3, #0		@ get FSR
-	movc	r0, p0.c4, #0		@ get FAR
-
-	@
-	@ set desired INTR state, then call main handler
-	@
-	mov.a	asr, r17
-	mov	r2, sp
-	b.l	do_DataAbort
-
-	@
-	@ INTRs off again before pulling preserved data off the stack
-	@
-	disable_irq r0
-
-	@
-	@ restore BSR and restart the instruction
-	@
-	ldw	r2, [sp+], #S_PSR
-	priv_exit r2				@ return from exception
-ENDPROC(__dabt_priv)
-
-	.align	5
-__intr_priv:
-	priv_entry
-
-	intr_handler
-
-	mov	r0, #0				@ epip4d
-	movc	p0.c5, r0, #14
-	nop; nop; nop; nop; nop; nop; nop; nop
-
-	ldw	r4, [sp+], #S_PSR		@ irqs are already disabled
-
-	priv_exit r4				@ return from exception
-ENDPROC(__intr_priv)
-
-	.ltorg
-
-	.align	5
-__extn_priv:
-	priv_entry
-
-	mov	r0, sp				@ struct pt_regs *regs
-	mov	r1, asr
-	b	bad_mode			@ not supported
-ENDPROC(__extn_priv)
-
-	.align	5
-__pabt_priv:
-	priv_entry
-
-	@
-	@ re-enable interrupts if appropriate
-	@
-	mov	r17, asr
-	cand.a	r3, #PSR_I_BIT
-	bne	1f
-	andn	r17, r17, #PSR_I_BIT
-1:
-
-	@
-	@ set args, then call main handler
-	@
-	@  r0 - address of faulting instruction
-	@  r1 - pointer to registers on stack
-	@
-	mov	r0, r2			@ pass address of aborted instruction
-	mov	r1, #5
-	mov.a	asr, r17
-	mov	r2, sp			@ regs
-	b.l	do_PrefetchAbort	@ call abort handler
-
-	@
-	@ INTRs off again before pulling preserved data off the stack
-	@
-	disable_irq r0
-
-	@
-	@ restore BSR and restart the instruction
-	@
-	ldw	r2, [sp+], #S_PSR
-	priv_exit r2			@ return from exception
-ENDPROC(__pabt_priv)
-
-	.align	5
-.LCcralign:
-	.word	cr_alignment
-
-	.align	5
-__dabt_user:
-	user_entry
-
-#ifdef CONFIG_UNICORE_FPU_F64
-	cff	ip, s31
-	cand.a	ip, #0x08000000		@ FPU execption traps?
-	beq	209f
-
-	ldw	ip, [sp+], #S_PC
-	add	ip, ip, #4
-	stw	ip, [sp+], #S_PC
-	@
-	@ fall through to the emulation code, which returns using r19 if
-	@ it has emulated the instruction, or the more conventional lr
-	@ if we are to treat this as a real extended instruction
-	@
-	@  r0 - instruction
-	@
-1:	ldw.u	r0, [r2]
-	adr	r19, ret_from_exception
-	adr	lr, 209f
-	@
-	@ fallthrough to call do_uc_f64
-	@
-/*
- * Check whether the instruction is a co-processor instruction.
- * If yes, we need to call the relevant co-processor handler.
- *
- * Note that we don't do a full check here for the co-processor
- * instructions; all instructions with bit 27 set are well
- * defined.  The only instructions that should fault are the
- * co-processor instructions.
- *
- * Emulators may wish to make use of the following registers:
- *  r0  = instruction opcode.
- *  r2  = PC
- *  r19 = normal "successful" return address
- *  r20 = this threads thread_info structure.
- *  lr  = unrecognised instruction return address
- */
-	get_thread_info r20			@ get current thread
-	and	r8, r0, #0x00003c00		@ mask out CP number
-	mov	r7, #1
-	stb	r7, [r20+], #TI_USED_CP + 2	@ set appropriate used_cp[]
-
-	@ F64 hardware support entry point.
-	@  r0  = faulted instruction
-	@  r19 = return address
-	@  r20 = fp_state
-	enable_irq r4
-	add	r20, r20, #TI_FPSTATE	@ r20 = workspace
-	cff	r1, s31			@ get fpu FPSCR
-	andn    r2, r1, #0x08000000
-	ctf     r2, s31			@ clear 27 bit
-	mov	r2, sp			@ nothing stacked - regdump is at TOS
-	mov	lr, r19			@ setup for a return to the user code
-
-	@ Now call the C code to package up the bounce to the support code
-	@   r0 holds the trigger instruction
-	@   r1 holds the FPSCR value
-	@   r2 pointer to register dump
-	b	ucf64_exchandler
-209:
-#endif
-	@
-	@ Call the processor-specific abort handler:
-	@
-	@  r2 - aborted context pc
-	@  r3 - aborted context asr
-	@
-	@ The abort handler must return the aborted address in r0, and
-	@ the fault status register in r1.
-	@
-	movc	r1, p0.c3, #0		@ get FSR
-	movc	r0, p0.c4, #0		@ get FAR
-
-	@
-	@ INTRs on, then call the main handler
-	@
-	enable_irq r2
-	mov	r2, sp
-	adr	lr, ret_from_exception
-	b	do_DataAbort
-ENDPROC(__dabt_user)
-
-	.align	5
-__intr_user:
-	user_entry
-
-	get_thread_info tsk
-
-	intr_handler
-
-	mov	why, #0
-	b	ret_to_user
-ENDPROC(__intr_user)
-
-	.ltorg
-
-	.align	5
-__extn_user:
-	user_entry
-
-	mov	r0, sp
-	mov	r1, asr
-	b	bad_mode
-ENDPROC(__extn_user)
-
-	.align	5
-__pabt_user:
-	user_entry
-
-	mov	r0, r2			@ pass address of aborted instruction.
-	mov	r1, #5
-	enable_irq r1			@ Enable interrupts
-	mov	r2, sp			@ regs
-	b.l	do_PrefetchAbort	@ call abort handler
-	/* fall through */
-/*
- * This is the return code to user mode for abort handlers
- */
-ENTRY(ret_from_exception)
-	get_thread_info tsk
-	mov	why, #0
-	b	ret_to_user
-ENDPROC(__pabt_user)
-ENDPROC(ret_from_exception)
-
-/*
- * Register switch for UniCore V2 processors
- * r0 = previous task_struct, r1 = previous thread_info, r2 = next thread_info
- * previous and next are guaranteed not to be the same.
- */
-ENTRY(__switch_to)
-	add	ip, r1, #TI_CPU_SAVE
-	stm.w	(r4 - r15), [ip]+
-	stm.w	(r16 - r27, sp, lr), [ip]+
-
-#ifdef	CONFIG_UNICORE_FPU_F64
-	add	ip, r1, #TI_FPSTATE
-	sfm.w	(f0  - f7 ), [ip]+
-	sfm.w	(f8  - f15), [ip]+
-	sfm.w	(f16 - f23), [ip]+
-	sfm.w	(f24 - f31), [ip]+
-	cff	r4, s31
-	stw	r4, [ip]
-
-	add	ip, r2, #TI_FPSTATE
-	lfm.w	(f0  - f7 ), [ip]+
-	lfm.w	(f8  - f15), [ip]+
-	lfm.w	(f16 - f23), [ip]+
-	lfm.w	(f24 - f31), [ip]+
-	ldw	r4, [ip]
-	ctf	r4, s31
-#endif
-	add	ip, r2, #TI_CPU_SAVE
-	ldm.w	(r4 - r15), [ip]+
-	ldm	(r16 - r27, sp, pc), [ip]+	@ Load all regs saved previously
-ENDPROC(__switch_to)
-
-	.align	5
-/*
- * This is the fast syscall return path.  We do as little as
- * possible here, and this includes saving r0 back into the PRIV
- * stack.
- */
-ret_fast_syscall:
-	disable_irq r1				@ disable interrupts
-	ldw	r1, [tsk+], #TI_FLAGS
-	cand.a	r1, #_TIF_WORK_MASK
-	bne	fast_work_pending
-
-	@ fast_restore_user_regs
-	restore_user_regs fast = 1, offset = S_OFF
-
-/*
- * Ok, we need to do extra processing, enter the slow path.
- */
-fast_work_pending:
-	stw.w	r0, [sp+], #S_R0+S_OFF		@ returned r0
-work_pending:
-	cand.a	r1, #_TIF_NEED_RESCHED
-	bne	work_resched
-	mov	r0, sp				@ 'regs'
-	mov	r2, why				@ 'syscall'
-	cand.a	r1, #_TIF_SIGPENDING		@ delivering a signal?
-	cmovne	why, #0				@ prevent further restarts
-	b.l	do_notify_resume
-	b	ret_slow_syscall		@ Check work again
-
-work_resched:
-	b.l	schedule
-/*
- * "slow" syscall return path.  "why" tells us if this was a real syscall.
- */
-ENTRY(ret_to_user)
-ret_slow_syscall:
-	disable_irq r1				@ disable interrupts
-	get_thread_info tsk			@ epip4d, one path error?!
-	ldw	r1, [tsk+], #TI_FLAGS
-	cand.a	r1, #_TIF_WORK_MASK
-	bne	work_pending
-no_work_pending:
-	@ slow_restore_user_regs
-	restore_user_regs fast = 0, offset = 0
-ENDPROC(ret_to_user)
-
-/*
- * This is how we return from a fork.
- */
-ENTRY(ret_from_fork)
-	b.l	schedule_tail
-	b	ret_slow_syscall
-ENDPROC(ret_from_fork)
-
-ENTRY(ret_from_kernel_thread)
-	b.l	schedule_tail
-	mov	r0, r5
-	adr	lr, ret_slow_syscall
-	mov	pc, r4
-ENDPROC(ret_from_kernel_thread)
-
-/*=============================================================================
- * SWI handler
- *-----------------------------------------------------------------------------
- */
-	.align	5
-ENTRY(vector_swi)
-	sub	sp, sp, #S_FRAME_SIZE
-	stm	(r0 - r15), [sp]+		@ Calling r0 - r15
-	add	r8, sp, #S_R16
-	stm	(r16 - r28), [r8]+		@ Calling r16 - r28
-	add	r8, sp, #S_PC
-	stur	(sp, lr), [r8-]			@ Calling sp, lr
-	mov	r8, bsr				@ called from non-REAL mode
-	stw	lr, [sp+], #S_PC		@ Save calling PC
-	stw	r8, [sp+], #S_PSR		@ Save ASR
-	stw	r0, [sp+], #S_OLD_R0		@ Save OLD_R0
-	zero_fp
-
-	/*
-	 * Get the system call number.
-	 */
-	sub	ip, lr, #4
-	ldw.u	scno, [ip]			@ get SWI instruction
-
-#ifdef CONFIG_ALIGNMENT_TRAP
-	ldw	ip, __cr_alignment
-	ldw	ip, [ip]
-	movc	p0.c1, ip, #0                   @ update control register
-#endif
-	enable_irq ip
-
-	get_thread_info tsk
-	ldw	tbl, =sys_call_table		@ load syscall table pointer
-
-	andn	scno, scno, #0xff000000		@ mask off SWI op-code
-	andn	scno, scno, #0x00ff0000		@ mask off SWI op-code
-
-	stm.w	(r4, r5), [sp-]			@ push fifth and sixth args
-	ldw	ip, [tsk+], #TI_FLAGS		@ check for syscall tracing
-	cand.a	ip, #_TIF_SYSCALL_TRACE		@ are we tracing syscalls?
-	bne	__sys_trace
-
-	csub.a	scno, #__NR_syscalls		@ check upper syscall limit
-	adr	lr, ret_fast_syscall		@ return address
-	bea	1f
-	ldw	pc, [tbl+], scno << #2		@ call sys_* routine
-1:
-	add	r1, sp, #S_OFF
-2:	mov	why, #0				@ no longer a real syscall
-	b	sys_ni_syscall			@ not private func
-
-	/*
-	 * This is the really slow path.  We're going to be doing
-	 * context switches, and waiting for our parent to respond.
-	 */
-__sys_trace:
-	mov	r2, scno
-	add	r1, sp, #S_OFF
-	mov	r0, #0				@ trace entry [IP = 0]
-	b.l	syscall_trace
-
-	adr	lr, __sys_trace_return		@ return address
-	mov	scno, r0			@ syscall number (possibly new)
-	add	r1, sp, #S_R0 + S_OFF		@ pointer to regs
-	csub.a	scno, #__NR_syscalls		@ check upper syscall limit
-	bea	2b
-	ldm	(r0 - r3), [r1]+		@ have to reload r0 - r3
-	ldw	pc, [tbl+], scno << #2		@ call sys_* routine
-
-__sys_trace_return:
-	stw.w	r0, [sp+], #S_R0 + S_OFF	@ save returned r0
-	mov	r2, scno
-	mov	r1, sp
-	mov	r0, #1				@ trace exit [IP = 1]
-	b.l	syscall_trace
-	b	ret_slow_syscall
-
-	.align	5
-#ifdef CONFIG_ALIGNMENT_TRAP
-	.type	__cr_alignment, #object
-__cr_alignment:
-	.word	cr_alignment
-#endif
-	.ltorg
-
-ENTRY(sys_rt_sigreturn)
-		add	r0, sp, #S_OFF
-		mov	why, #0		@ prevent syscall restart handling
-		b	__sys_rt_sigreturn
-ENDPROC(sys_rt_sigreturn)
-
-	__INIT
-
-/*
- * Vector stubs.
- *
- * This code is copied to 0xffff0200 so we can use branches in the
- * vectors, rather than ldr's.  Note that this code must not
- * exceed 0x300 bytes.
- *
- * Common stub entry macro:
- *   Enter in INTR mode, bsr = PRIV/USER ASR, lr = PRIV/USER PC
- *
- * SP points to a minimal amount of processor-private memory, the address
- * of which is copied into r0 for the mode specific abort handler.
- */
-	.macro	vector_stub, name, mode
-	.align	5
-
-vector_\name:
-	@
-	@ Save r0, lr_<exception> (parent PC) and bsr_<exception>
-	@ (parent ASR)
-	@
-	stw	r0, [sp]
-	stw	lr, [sp+], #4		@ save r0, lr
-	mov	lr, bsr
-	stw	lr, [sp+], #8		@ save bsr
-
-	@
-	@ Prepare for PRIV mode.  INTRs remain disabled.
-	@
-	mov	r0, asr
-	xor	r0, r0, #(\mode ^ PRIV_MODE)
-	mov.a	bsr, r0
-
-	@
-	@ the branch table must immediately follow this code
-	@
-	and	lr, lr, #0x03
-	add	lr, lr, #1
-	mov	r0, sp
-	ldw	lr, [pc+], lr << #2
-	mov.a	pc, lr			@ branch to handler in PRIV mode
-ENDPROC(vector_\name)
-	.align	2
-	@ handler addresses follow this label
-	.endm
-
-	.globl	__stubs_start
-__stubs_start:
-/*
- * Interrupt dispatcher
- */
-	vector_stub	intr, INTR_MODE
-
-	.long	__intr_user			@  0  (USER)
-	.long	__invalid			@  1
-	.long	__invalid			@  2
-	.long	__intr_priv			@  3  (PRIV)
-
-/*
- * Data abort dispatcher
- * Enter in ABT mode, bsr = USER ASR, lr = USER PC
- */
-	vector_stub	dabt, ABRT_MODE
-
-	.long	__dabt_user			@  0  (USER)
-	.long	__invalid			@  1
-	.long	__invalid			@  2  (INTR)
-	.long	__dabt_priv			@  3  (PRIV)
-
-/*
- * Prefetch abort dispatcher
- * Enter in ABT mode, bsr = USER ASR, lr = USER PC
- */
-	vector_stub	pabt, ABRT_MODE
-
-	.long	__pabt_user			@  0 (USER)
-	.long	__invalid			@  1
-	.long	__invalid			@  2 (INTR)
-	.long	__pabt_priv			@  3 (PRIV)
-
-/*
- * Undef instr entry dispatcher
- * Enter in EXTN mode, bsr = PRIV/USER ASR, lr = PRIV/USER PC
- */
-	vector_stub	extn, EXTN_MODE
-
-	.long	__extn_user			@  0 (USER)
-	.long	__invalid			@  1
-	.long	__invalid			@  2 (INTR)
-	.long	__extn_priv			@  3 (PRIV)
-
-/*
- * We group all the following data together to optimise
- * for CPUs with separate I & D caches.
- */
-	.align	5
-
-.LCvswi:
-	.word	vector_swi
-
-	.globl	__stubs_end
-__stubs_end:
-
-	.equ	stubs_offset, __vectors_start + 0x200 - __stubs_start
-
-	.globl	__vectors_start
-__vectors_start:
-	jepriv	SYS_ERROR0
-	b	vector_extn + stubs_offset
-	ldw	pc, .LCvswi + stubs_offset
-	b	vector_pabt + stubs_offset
-	b	vector_dabt + stubs_offset
-	jepriv	SYS_ERROR0
-	b	vector_intr + stubs_offset
-	jepriv	SYS_ERROR0
-
-	.globl	__vectors_end
-__vectors_end:
-
-	.data
-
-	.globl	cr_alignment
-	.globl	cr_no_alignment
-cr_alignment:
-	.space	4
-cr_no_alignment:
-	.space	4
diff --git a/arch/unicore32/kernel/fpu-ucf64.c b/arch/unicore32/kernel/fpu-ucf64.c
deleted file mode 100644
index 85f0af29d29b..000000000000
--- a/arch/unicore32/kernel/fpu-ucf64.c
+++ /dev/null
@@ -1,117 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/fpu-ucf64.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/signal.h>
-#include <linux/sched/signal.h>
-#include <linux/init.h>
-
-#include <asm/fpu-ucf64.h>
-
-/*
- * A special flag to tell the normalisation code not to normalise.
- */
-#define F64_NAN_FLAG	0x100
-
-/*
- * A bit pattern used to indicate the initial (unset) value of the
- * exception mask, in case nothing handles an instruction.  This
- * doesn't include the NAN flag, which get masked out before
- * we check for an error.
- */
-#define F64_EXCEPTION_ERROR	((u32)-1 & ~F64_NAN_FLAG)
-
-/*
- * Since we aren't building with -mfpu=f64, we need to code
- * these instructions using their MRC/MCR equivalents.
- */
-#define f64reg(_f64_) #_f64_
-
-#define cff(_f64_) ({			\
-	u32 __v;			\
-	asm("cff %0, " f64reg(_f64_) "@ fmrx	%0, " #_f64_	\
-	    : "=r" (__v) : : "cc");	\
-	__v;				\
-	})
-
-#define ctf(_f64_, _var_)		\
-	asm("ctf %0, " f64reg(_f64_) "@ fmxr	" #_f64_ ", %0"	\
-	   : : "r" (_var_) : "cc")
-
-/*
- * Raise a SIGFPE for the current process.
- * sicode describes the signal being raised.
- */
-void ucf64_raise_sigfpe(struct pt_regs *regs)
-{
-	/*
-	 * This is the same as NWFPE, because it's not clear what
-	 * this is used for
-	 */
-	current->thread.error_code = 0;
-	current->thread.trap_no = 6;
-
-	send_sig_fault(SIGFPE, FPE_FLTUNK,
-		       (void __user *)(instruction_pointer(regs) - 4),
-		       current);
-}
-
-/*
- * Handle exceptions of UniCore-F64.
- */
-void ucf64_exchandler(u32 inst, u32 fpexc, struct pt_regs *regs)
-{
-	u32 tmp = fpexc;
-	u32 exc = F64_EXCEPTION_ERROR & fpexc;
-
-	pr_debug("UniCore-F64: instruction %08x fpscr %08x\n",
-			inst, fpexc);
-
-	if (exc & FPSCR_CMPINSTR_BIT) {
-		if (exc & FPSCR_CON)
-			tmp |= FPSCR_CON;
-		else
-			tmp &= ~(FPSCR_CON);
-		exc &= ~(FPSCR_CMPINSTR_BIT | FPSCR_CON);
-	} else {
-		pr_debug("UniCore-F64 Error: unhandled exceptions\n");
-		pr_debug("UniCore-F64 FPSCR 0x%08x INST 0x%08x\n",
-				cff(FPSCR), inst);
-
-		ucf64_raise_sigfpe(regs);
-		return;
-	}
-
-	/*
-	 * Update the FPSCR with the additional exception flags.
-	 * Comparison instructions always return at least one of
-	 * these flags set.
-	 */
-	tmp &= ~(FPSCR_TRAP | FPSCR_IOS | FPSCR_OFS | FPSCR_UFS |
-			FPSCR_IXS | FPSCR_HIS | FPSCR_IOC | FPSCR_OFC |
-			FPSCR_UFC | FPSCR_IXC | FPSCR_HIC);
-
-	tmp |= exc;
-	ctf(FPSCR, tmp);
-}
-
-/*
- * F64 support code initialisation.
- */
-static int __init ucf64_init(void)
-{
-	ctf(FPSCR, 0x0);     /* FPSCR_UFE | FPSCR_NDE perhaps better */
-
-	printk(KERN_INFO "Enable UniCore-F64 support.\n");
-
-	return 0;
-}
-
-late_initcall(ucf64_init);
diff --git a/arch/unicore32/kernel/gpio.c b/arch/unicore32/kernel/gpio.c
deleted file mode 100644
index 36d395b54b7c..000000000000
--- a/arch/unicore32/kernel/gpio.c
+++ /dev/null
@@ -1,121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/gpio.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-/* in FPGA, no GPIO support */
-
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/gpio/driver.h>
-/* FIXME: needed for gpio_set_value() - convert to use descriptors or hogs */
-#include <linux/gpio.h>
-#include <mach/hardware.h>
-
-#ifdef CONFIG_LEDS
-#include <linux/leds.h>
-#include <linux/platform_device.h>
-
-static const struct gpio_led puv3_gpio_leds[] = {
-	{ .name = "cpuhealth", .gpio = GPO_CPU_HEALTH, .active_low = 0,
-		.default_trigger = "heartbeat",	},
-	{ .name = "hdd_led", .gpio = GPO_HDD_LED, .active_low = 1,
-		.default_trigger = "disk-activity", },
-};
-
-static const struct gpio_led_platform_data puv3_gpio_led_data = {
-	.num_leds =	ARRAY_SIZE(puv3_gpio_leds),
-	.leds =		(void *) puv3_gpio_leds,
-};
-
-static struct platform_device puv3_gpio_gpio_leds = {
-	.name =		"leds-gpio",
-	.id =		-1,
-	.dev = {
-		.platform_data = (void *) &puv3_gpio_led_data,
-	}
-};
-
-static int __init puv3_gpio_leds_init(void)
-{
-	platform_device_register(&puv3_gpio_gpio_leds);
-	return 0;
-}
-
-device_initcall(puv3_gpio_leds_init);
-#endif
-
-static int puv3_gpio_get(struct gpio_chip *chip, unsigned offset)
-{
-	return !!(readl(GPIO_GPLR) & GPIO_GPIO(offset));
-}
-
-static void puv3_gpio_set(struct gpio_chip *chip, unsigned offset, int value)
-{
-	if (value)
-		writel(GPIO_GPIO(offset), GPIO_GPSR);
-	else
-		writel(GPIO_GPIO(offset), GPIO_GPCR);
-}
-
-static int puv3_direction_input(struct gpio_chip *chip, unsigned offset)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	writel(readl(GPIO_GPDR) & ~GPIO_GPIO(offset), GPIO_GPDR);
-	local_irq_restore(flags);
-	return 0;
-}
-
-static int puv3_direction_output(struct gpio_chip *chip, unsigned offset,
-		int value)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	puv3_gpio_set(chip, offset, value);
-	writel(readl(GPIO_GPDR) | GPIO_GPIO(offset), GPIO_GPDR);
-	local_irq_restore(flags);
-	return 0;
-}
-
-static struct gpio_chip puv3_gpio_chip = {
-	.label			= "gpio",
-	.direction_input	= puv3_direction_input,
-	.direction_output	= puv3_direction_output,
-	.set			= puv3_gpio_set,
-	.get			= puv3_gpio_get,
-	.base			= 0,
-	.ngpio			= GPIO_MAX + 1,
-};
-
-void __init puv3_init_gpio(void)
-{
-	writel(GPIO_DIR, GPIO_GPDR);
-#if	defined(CONFIG_PUV3_NB0916) || defined(CONFIG_PUV3_SMW0919)	\
-	|| defined(CONFIG_PUV3_DB0913)
-	gpio_set_value(GPO_WIFI_EN, 1);
-	gpio_set_value(GPO_HDD_LED, 1);
-	gpio_set_value(GPO_VGA_EN, 1);
-	gpio_set_value(GPO_LCD_EN, 1);
-	gpio_set_value(GPO_CAM_PWR_EN, 0);
-	gpio_set_value(GPO_LCD_VCC_EN, 1);
-	gpio_set_value(GPO_SOFT_OFF, 1);
-	gpio_set_value(GPO_BT_EN, 1);
-	gpio_set_value(GPO_FAN_ON, 0);
-	gpio_set_value(GPO_SPKR, 0);
-	gpio_set_value(GPO_CPU_HEALTH, 1);
-	gpio_set_value(GPO_LAN_SEL, 1);
-/*
- * DO NOT modify the GPO_SET_V1 and GPO_SET_V2 in kernel
- *	gpio_set_value(GPO_SET_V1, 1);
- *	gpio_set_value(GPO_SET_V2, 1);
- */
-#endif
-	gpiochip_add_data(&puv3_gpio_chip, NULL);
-}
diff --git a/arch/unicore32/kernel/head.S b/arch/unicore32/kernel/head.S
deleted file mode 100644
index 9bbb8668f9f7..000000000000
--- a/arch/unicore32/kernel/head.S
+++ /dev/null
@@ -1,249 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/head.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <linux/init.h>
-
-#include <asm/assembler.h>
-#include <asm/ptrace.h>
-#include <generated/asm-offsets.h>
-#include <asm/memory.h>
-#include <asm/thread_info.h>
-#include <asm/hwdef-copro.h>
-#include <asm/pgtable-hwdef.h>
-
-#if (PHYS_OFFSET & 0x003fffff)
-#error "PHYS_OFFSET must be at an even 4MiB boundary!"
-#endif
-
-#define KERNEL_RAM_VADDR	(PAGE_OFFSET + KERNEL_IMAGE_START)
-#define KERNEL_RAM_PADDR	(PHYS_OFFSET + KERNEL_IMAGE_START)
-
-#define KERNEL_PGD_PADDR	(KERNEL_RAM_PADDR - 0x1000)
-#define KERNEL_PGD_VADDR	(KERNEL_RAM_VADDR - 0x1000)
-
-#define KERNEL_START		KERNEL_RAM_VADDR
-#define KERNEL_END		_end
-
-/*
- * swapper_pg_dir is the virtual address of the initial page table.
- * We place the page tables 4K below KERNEL_RAM_VADDR.  Therefore, we must
- * make sure that KERNEL_RAM_VADDR is correctly set.  Currently, we expect
- * the least significant 16 bits to be 0x8000, but we could probably
- * relax this restriction to KERNEL_RAM_VADDR >= PAGE_OFFSET + 0x1000.
- */
-#if (KERNEL_RAM_VADDR & 0xffff) != 0x8000
-#error KERNEL_RAM_VADDR must start at 0xXXXX8000
-#endif
-
-	.globl	swapper_pg_dir
-	.equ	swapper_pg_dir, KERNEL_RAM_VADDR - 0x1000
-
-/*
- * Kernel startup entry point.
- * ---------------------------
- *
- * This is normally called from the decompressor code.  The requirements
- * are: MMU = off, D-cache = off, I-cache = dont care
- *
- * This code is mostly position independent, so if you link the kernel at
- * 0xc0008000, you call this at __pa(0xc0008000).
- */
-	__HEAD
-ENTRY(stext)
-	@ set asr
-	mov	r0, #PRIV_MODE			@ ensure priv mode
-	or	r0, #PSR_R_BIT | PSR_I_BIT	@ disable irqs
-	mov.a	asr, r0
-
-	@ process identify
-	movc	r0, p0.c0, #0			@ cpuid
-	movl	r1, 0xff00ffff			@ mask
-	movl	r2, 0x4d000863			@ value
-	and	r0, r1, r0
-	cxor.a	r0, r2
-	bne	__error_p			@ invalid processor id
-
-	/*
-	 * Clear the 4K level 1 swapper page table
-	 */
-	movl	r0, #KERNEL_PGD_PADDR		@ page table address
-	mov	r1, #0
-	add	r2, r0, #0x1000
-101:	stw.w	r1, [r0]+, #4
-	stw.w	r1, [r0]+, #4
-	stw.w	r1, [r0]+, #4
-	stw.w	r1, [r0]+, #4
-	cxor.a	r0, r2
-	bne	101b
-
-	movl	r4, #KERNEL_PGD_PADDR		@ page table address
-	mov	r7, #PMD_TYPE_SECT | PMD_PRESENT	@ page size: section
-	or	r7, r7, #PMD_SECT_CACHEABLE		@ cacheable
-	or	r7, r7, #PMD_SECT_READ | PMD_SECT_WRITE | PMD_SECT_EXEC
-
-	/*
-	 * Create identity mapping for first 4MB of kernel to
-	 * cater for the MMU enable.  This identity mapping
-	 * will be removed by paging_init().  We use our current program
-	 * counter to determine corresponding section base address.
-	 */
-	mov	r6, pc
-	mov	r6, r6 >> #22			@ start of kernel section
-	or	r1, r7, r6 << #22		@ flags + kernel base
-	stw	r1, [r4+], r6 << #2		@ identity mapping
-
-	/*
-	 * Now setup the pagetables for our kernel direct
-	 * mapped region.
-	 */
-	add	r0, r4,  #(KERNEL_START & 0xff000000) >> 20
-	stw.w	r1, [r0+], #(KERNEL_START & 0x00c00000) >> 20
-	movl	r6, #(KERNEL_END - 1)
-	add	r0, r0, #4
-	add	r6, r4, r6 >> #20
-102:	csub.a	r0, r6
-	add	r1, r1, #1 << 22
-	bua	103f
-	stw.w	r1, [r0]+, #4
-	b	102b
-103:
-	/*
-	 * Then map first 4MB of ram in case it contains our boot params.
-	 */
-	add	r0, r4, #PAGE_OFFSET >> 20
-	or	r6, r7, #(PHYS_OFFSET & 0xffc00000)
-	stw	r6, [r0]
-
-	ldw	r15, __switch_data		@ address to jump to after
-
-	/*
-	 * Initialise TLB, Caches, and MMU state ready to switch the MMU
-	 * on.
-	 */
-	mov	r0, #0
-	movc	p0.c5, r0, #28			@ cache invalidate all
-	nop8
-	movc	p0.c6, r0, #6			@ TLB invalidate all
-	nop8
-
-	/*
-	 * ..V. .... ..TB IDAM
-	 * ..1. .... ..01 1111
-	 */
-	movl	r0, #0x201f			@ control register setting
-
-	/*
-	 * Setup common bits before finally enabling the MMU.  Essentially
-	 * this is just loading the page table pointer and domain access
-	 * registers.
-	 */
-	#ifndef CONFIG_ALIGNMENT_TRAP
-		andn	r0, r0, #CR_A
-	#endif
-	#ifdef CONFIG_CPU_DCACHE_DISABLE
-		andn	r0, r0, #CR_D
-	#endif
-	#ifdef CONFIG_CPU_DCACHE_WRITETHROUGH
-		andn	r0, r0, #CR_B
-	#endif
-	#ifdef CONFIG_CPU_ICACHE_DISABLE
-		andn	r0, r0, #CR_I
-	#endif
-
-	movc	p0.c2, r4, #0			@ set pgd
-	b	__turn_mmu_on
-ENDPROC(stext)
-
-/*
- * Enable the MMU.  This completely changes the structure of the visible
- * memory space.  You will not be able to trace execution through this.
- *
- *  r0  = cp#0 control register
- *  r15 = *virtual* address to jump to upon completion
- */
-	.align	5
-__turn_mmu_on:
-	mov	r0, r0
-	movc	p0.c1, r0, #0			@ write control reg
-	nop					@ fetch inst by phys addr
-	mov	pc, r15
-	nop8					@ fetch inst by phys addr
-ENDPROC(__turn_mmu_on)
-
-/*
- * Setup the initial page tables.  We only setup the barest
- * amount which are required to get the kernel running, which
- * generally means mapping in the kernel code.
- *
- * r9  = cpuid
- * r10 = procinfo
- *
- * Returns:
- *  r0, r3, r6, r7 corrupted
- *  r4 = physical page table address
- */
-	.ltorg
-
-	.align	2
-	.type	__switch_data, %object
-__switch_data:
-	.long	__mmap_switched
-	.long	__bss_start			@ r6
-	.long	_end				@ r7
-	.long	cr_alignment			@ r8
-	.long	init_thread_union + THREAD_START_SP @ sp
-
-/*
- * The following fragment of code is executed with the MMU on in MMU mode,
- * and uses absolute addresses; this is not position independent.
- *
- *  r0  = cp#0 control register
- */
-__mmap_switched:
-	adr	r3, __switch_data + 4
-
-	ldm.w	(r6, r7, r8), [r3]+
-	ldw	sp, [r3]
-
-	mov	fp, #0				@ Clear BSS (and zero fp)
-203:	csub.a	r6, r7
-	bea	204f
-	stw.w	fp, [r6]+,#4
-	b	203b
-204:
-	andn	r1, r0, #CR_A			@ Clear 'A' bit
-	stm	(r0, r1), [r8]+			@ Save control register values
-	b	start_kernel
-ENDPROC(__mmap_switched)
-
-/*
- * Exception handling.  Something went wrong and we can't proceed.  We
- * ought to tell the user, but since we don't have any guarantee that
- * we're even running on the right architecture, we do virtually nothing.
- *
- * If CONFIG_DEBUG_LL is set we try to print out something about the error
- * and hope for the best (useful if bootloader fails to pass a proper
- * machine ID for example).
- */
-__error_p:
-#ifdef CONFIG_DEBUG_LL
-	adr	r0, str_p1
-	b.l	printascii
-	mov	r0, r9
-	b.l	printhex8
-	adr	r0, str_p2
-	b.l	printascii
-901:	nop8
-	b	901b
-str_p1:	.asciz	"\nError: unrecognized processor variant (0x"
-str_p2:	.asciz	").\n"
-	.align
-#endif
-ENDPROC(__error_p)
-
diff --git a/arch/unicore32/kernel/hibernate.c b/arch/unicore32/kernel/hibernate.c
deleted file mode 100644
index 4cdf3c846a2d..000000000000
--- a/arch/unicore32/kernel/hibernate.c
+++ /dev/null
@@ -1,159 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  linux/arch/unicore32/kernel/hibernate.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/gfp.h>
-#include <linux/suspend.h>
-#include <linux/memblock.h>
-#include <linux/pgtable.h>
-
-#include <asm/page.h>
-#include <asm/pgalloc.h>
-#include <asm/sections.h>
-#include <asm/suspend.h>
-
-#include "mach/pm.h"
-
-/* Pointer to the temporary resume page tables */
-pgd_t *resume_pg_dir;
-
-struct swsusp_arch_regs swsusp_arch_regs_cpu0;
-
-/*
- * Create a middle page table on a resume-safe page and put a pointer to it in
- * the given global directory entry.  This only returns the gd entry
- * in non-PAE compilation mode, since the middle layer is folded.
- */
-static pmd_t *resume_one_md_table_init(pgd_t *pgd)
-{
-	pud_t *pud;
-	p4d_t *p4d;
-	pmd_t *pmd_table;
-
-	p4d = p4d_offset(pgd, 0);
-	pud = pud_offset(p4d, 0);
-	pmd_table = pmd_offset(pud, 0);
-
-	return pmd_table;
-}
-
-/*
- * Create a page table on a resume-safe page and place a pointer to it in
- * a middle page directory entry.
- */
-static pte_t *resume_one_page_table_init(pmd_t *pmd)
-{
-	if (pmd_none(*pmd)) {
-		pte_t *page_table = (pte_t *)get_safe_page(GFP_ATOMIC);
-		if (!page_table)
-			return NULL;
-
-		set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_KERNEL_TABLE));
-
-		BUG_ON(page_table != pte_offset_kernel(pmd, 0));
-
-		return page_table;
-	}
-
-	return pte_offset_kernel(pmd, 0);
-}
-
-/*
- * This maps the physical memory to kernel virtual address space, a total
- * of max_low_pfn pages, by creating page tables starting from address
- * PAGE_OFFSET.  The page tables are allocated out of resume-safe pages.
- */
-static int resume_physical_mapping_init(pgd_t *pgd_base)
-{
-	unsigned long pfn;
-	pgd_t *pgd;
-	pmd_t *pmd;
-	pte_t *pte;
-	int pgd_idx, pmd_idx;
-
-	pgd_idx = pgd_index(PAGE_OFFSET);
-	pgd = pgd_base + pgd_idx;
-	pfn = 0;
-
-	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
-		pmd = resume_one_md_table_init(pgd);
-		if (!pmd)
-			return -ENOMEM;
-
-		if (pfn >= max_low_pfn)
-			continue;
-
-		for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD; pmd++, pmd_idx++) {
-			pte_t *max_pte;
-
-			if (pfn >= max_low_pfn)
-				break;
-
-			/* Map with normal page tables.
-			 * NOTE: We can mark everything as executable here
-			 */
-			pte = resume_one_page_table_init(pmd);
-			if (!pte)
-				return -ENOMEM;
-
-			max_pte = pte + PTRS_PER_PTE;
-			for (; pte < max_pte; pte++, pfn++) {
-				if (pfn >= max_low_pfn)
-					break;
-
-				set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
-			}
-		}
-	}
-
-	return 0;
-}
-
-static inline void resume_init_first_level_page_table(pgd_t *pg_dir)
-{
-}
-
-int swsusp_arch_resume(void)
-{
-	int error;
-
-	resume_pg_dir = (pgd_t *)get_safe_page(GFP_ATOMIC);
-	if (!resume_pg_dir)
-		return -ENOMEM;
-
-	resume_init_first_level_page_table(resume_pg_dir);
-	error = resume_physical_mapping_init(resume_pg_dir);
-	if (error)
-		return error;
-
-	/* We have got enough memory and from now on we cannot recover */
-	restore_image(resume_pg_dir, restore_pblist);
-	return 0;
-}
-
-/*
- *	pfn_is_nosave - check if given pfn is in the 'nosave' section
- */
-
-int pfn_is_nosave(unsigned long pfn)
-{
-	unsigned long begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
-	unsigned long end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
-
-	return (pfn >= begin_pfn) && (pfn < end_pfn);
-}
-
-void save_processor_state(void)
-{
-}
-
-void restore_processor_state(void)
-{
-	local_flush_tlb_all();
-}
diff --git a/arch/unicore32/kernel/hibernate_asm.S b/arch/unicore32/kernel/hibernate_asm.S
deleted file mode 100644
index a589bc189e24..000000000000
--- a/arch/unicore32/kernel/hibernate_asm.S
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/hibernate_asm.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/sys.h>
-#include <linux/errno.h>
-#include <linux/linkage.h>
-#include <linux/pgtable.h>
-#include <generated/asm-offsets.h>
-#include <asm/page.h>
-#include <asm/assembler.h>
-
-@ restore_image(pgd_t *resume_pg_dir, struct pbe *restore_pblist)
-@ r0: resume_pg_dir
-@ r1: restore_pblist
-@ copy restore_pblist pages
-@ restore registers from swsusp_arch_regs_cpu0
-@
-ENTRY(restore_image)
-	sub	r0, r0, #PAGE_OFFSET
-	mov	r5, #0
-	movc	p0.c6, r5, #6	@invalidate ITLB & DTLB
-	movc	p0.c2, r0, #0
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-
-	.p2align 4,,7
-101:
-	csub.a	r1, #0
-	beq	109f
-
-	ldw	r6, [r1+], #PBE_ADDRESS
-	ldw	r7, [r1+], #PBE_ORIN_ADDRESS
-
-	movl	ip, #128
-102:	ldm.w	(r8 - r15), [r6]+
-	stm.w	(r8 - r15), [r7]+
-	sub.a	ip, ip, #1
-	bne	102b
-
-	ldw	r1, [r1+], #PBE_NEXT
-	b	101b
-
-	.p2align 4,,7
-109:
-	/* go back to the original page tables */
-	ldw	r0, =swapper_pg_dir
-	sub	r0, r0, #PAGE_OFFSET
-	mov	r5, #0
-	movc	p0.c6, r5, #6
-	movc	p0.c2, r0, #0
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-	nop
-
-#ifdef	CONFIG_UNICORE_FPU_F64
-	ldw	ip, 1f
-	add	ip, ip, #SWSUSP_FPSTATE
-	lfm.w	(f0  - f7 ), [ip]+
-	lfm.w	(f8  - f15), [ip]+
-	lfm.w	(f16 - f23), [ip]+
-	lfm.w	(f24 - f31), [ip]+
-	ldw	r4, [ip]
-	ctf	r4, s31
-#endif
-	mov	r0, #0x0
-	ldw	ip, 1f
-	add	ip, ip, #SWSUSP_CPU
-	ldm.w	(r4 - r15), [ip]+
-	ldm	(r16 - r27, sp, pc), [ip]+	@ Load all regs saved previously
-
-	.align	2
-1:	.long	swsusp_arch_regs_cpu0
-
-
-@ swsusp_arch_suspend()
-@ - prepare pc for resume, return from function without swsusp_save on resume
-@ - save registers in swsusp_arch_regs_cpu0
-@ - call swsusp_save write suspend image
-
-ENTRY(swsusp_arch_suspend)
-	ldw	ip, 1f
-	add	ip, ip, #SWSUSP_CPU
-	stm.w	(r4 - r15), [ip]+
-	stm.w	(r16 - r27, sp, lr), [ip]+
-
-#ifdef	CONFIG_UNICORE_FPU_F64
-	ldw	ip, 1f
-	add	ip, ip, #SWSUSP_FPSTATE
-	sfm.w	(f0  - f7 ), [ip]+
-	sfm.w	(f8  - f15), [ip]+
-	sfm.w	(f16 - f23), [ip]+
-	sfm.w	(f24 - f31), [ip]+
-	cff	r4, s31
-	stw	r4, [ip]
-#endif
-	b	swsusp_save			@ no return
-
-1:	.long	swsusp_arch_regs_cpu0
diff --git a/arch/unicore32/kernel/irq.c b/arch/unicore32/kernel/irq.c
deleted file mode 100644
index c014ae3c3e48..000000000000
--- a/arch/unicore32/kernel/irq.c
+++ /dev/null
@@ -1,371 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/irq.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/kernel_stat.h>
-#include <linux/module.h>
-#include <linux/signal.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/random.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/seq_file.h>
-#include <linux/errno.h>
-#include <linux/list.h>
-#include <linux/kallsyms.h>
-#include <linux/proc_fs.h>
-#include <linux/syscore_ops.h>
-
-#include <mach/hardware.h>
-
-#include "setup.h"
-
-/*
- * PKUnity GPIO edge detection for IRQs:
- * IRQs are generated on Falling-Edge, Rising-Edge, or both.
- * Use this instead of directly setting GRER/GFER.
- */
-static int GPIO_IRQ_rising_edge;
-static int GPIO_IRQ_falling_edge;
-static int GPIO_IRQ_mask = 0;
-
-#define GPIO_MASK(irq)		(1 << (irq - IRQ_GPIO0))
-
-static int puv3_gpio_type(struct irq_data *d, unsigned int type)
-{
-	unsigned int mask;
-
-	if (d->irq < IRQ_GPIOHIGH)
-		mask = 1 << d->irq;
-	else
-		mask = GPIO_MASK(d->irq);
-
-	if (type == IRQ_TYPE_PROBE) {
-		if ((GPIO_IRQ_rising_edge | GPIO_IRQ_falling_edge) & mask)
-			return 0;
-		type = IRQ_TYPE_EDGE_RISING | IRQ_TYPE_EDGE_FALLING;
-	}
-
-	if (type & IRQ_TYPE_EDGE_RISING)
-		GPIO_IRQ_rising_edge |= mask;
-	else
-		GPIO_IRQ_rising_edge &= ~mask;
-	if (type & IRQ_TYPE_EDGE_FALLING)
-		GPIO_IRQ_falling_edge |= mask;
-	else
-		GPIO_IRQ_falling_edge &= ~mask;
-
-	writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER);
-	writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER);
-
-	return 0;
-}
-
-/*
- * GPIO IRQs must be acknowledged.  This is for IRQs from 0 to 7.
- */
-static void puv3_low_gpio_ack(struct irq_data *d)
-{
-	writel((1 << d->irq), GPIO_GEDR);
-}
-
-static void puv3_low_gpio_mask(struct irq_data *d)
-{
-	writel(readl(INTC_ICMR) & ~(1 << d->irq), INTC_ICMR);
-}
-
-static void puv3_low_gpio_unmask(struct irq_data *d)
-{
-	writel(readl(INTC_ICMR) | (1 << d->irq), INTC_ICMR);
-}
-
-static int puv3_low_gpio_wake(struct irq_data *d, unsigned int on)
-{
-	if (on)
-		writel(readl(PM_PWER) | (1 << d->irq), PM_PWER);
-	else
-		writel(readl(PM_PWER) & ~(1 << d->irq), PM_PWER);
-	return 0;
-}
-
-static struct irq_chip puv3_low_gpio_chip = {
-	.name		= "GPIO-low",
-	.irq_ack	= puv3_low_gpio_ack,
-	.irq_mask	= puv3_low_gpio_mask,
-	.irq_unmask	= puv3_low_gpio_unmask,
-	.irq_set_type	= puv3_gpio_type,
-	.irq_set_wake	= puv3_low_gpio_wake,
-};
-
-/*
- * IRQ8 (GPIO0 through 27) handler.  We enter here with the
- * irq_controller_lock held, and IRQs disabled.  Decode the IRQ
- * and call the handler.
- */
-static void puv3_gpio_handler(struct irq_desc *desc)
-{
-	unsigned int mask, irq;
-
-	mask = readl(GPIO_GEDR);
-	do {
-		/*
-		 * clear down all currently active IRQ sources.
-		 * We will be processing them all.
-		 */
-		writel(mask, GPIO_GEDR);
-
-		irq = IRQ_GPIO0;
-		do {
-			if (mask & 1)
-				generic_handle_irq(irq);
-			mask >>= 1;
-			irq++;
-		} while (mask);
-		mask = readl(GPIO_GEDR);
-	} while (mask);
-}
-
-/*
- * GPIO0-27 edge IRQs need to be handled specially.
- * In addition, the IRQs are all collected up into one bit in the
- * interrupt controller registers.
- */
-static void puv3_high_gpio_ack(struct irq_data *d)
-{
-	unsigned int mask = GPIO_MASK(d->irq);
-
-	writel(mask, GPIO_GEDR);
-}
-
-static void puv3_high_gpio_mask(struct irq_data *d)
-{
-	unsigned int mask = GPIO_MASK(d->irq);
-
-	GPIO_IRQ_mask &= ~mask;
-
-	writel(readl(GPIO_GRER) & ~mask, GPIO_GRER);
-	writel(readl(GPIO_GFER) & ~mask, GPIO_GFER);
-}
-
-static void puv3_high_gpio_unmask(struct irq_data *d)
-{
-	unsigned int mask = GPIO_MASK(d->irq);
-
-	GPIO_IRQ_mask |= mask;
-
-	writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER);
-	writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER);
-}
-
-static int puv3_high_gpio_wake(struct irq_data *d, unsigned int on)
-{
-	if (on)
-		writel(readl(PM_PWER) | PM_PWER_GPIOHIGH, PM_PWER);
-	else
-		writel(readl(PM_PWER) & ~PM_PWER_GPIOHIGH, PM_PWER);
-	return 0;
-}
-
-static struct irq_chip puv3_high_gpio_chip = {
-	.name		= "GPIO-high",
-	.irq_ack	= puv3_high_gpio_ack,
-	.irq_mask	= puv3_high_gpio_mask,
-	.irq_unmask	= puv3_high_gpio_unmask,
-	.irq_set_type	= puv3_gpio_type,
-	.irq_set_wake	= puv3_high_gpio_wake,
-};
-
-/*
- * We don't need to ACK IRQs on the PKUnity unless they're GPIOs
- * this is for internal IRQs i.e. from 8 to 31.
- */
-static void puv3_mask_irq(struct irq_data *d)
-{
-	writel(readl(INTC_ICMR) & ~(1 << d->irq), INTC_ICMR);
-}
-
-static void puv3_unmask_irq(struct irq_data *d)
-{
-	writel(readl(INTC_ICMR) | (1 << d->irq), INTC_ICMR);
-}
-
-/*
- * Apart form GPIOs, only the RTC alarm can be a wakeup event.
- */
-static int puv3_set_wake(struct irq_data *d, unsigned int on)
-{
-	if (d->irq == IRQ_RTCAlarm) {
-		if (on)
-			writel(readl(PM_PWER) | PM_PWER_RTC, PM_PWER);
-		else
-			writel(readl(PM_PWER) & ~PM_PWER_RTC, PM_PWER);
-		return 0;
-	}
-	return -EINVAL;
-}
-
-static struct irq_chip puv3_normal_chip = {
-	.name		= "PKUnity-v3",
-	.irq_ack	= puv3_mask_irq,
-	.irq_mask	= puv3_mask_irq,
-	.irq_unmask	= puv3_unmask_irq,
-	.irq_set_wake	= puv3_set_wake,
-};
-
-static struct resource irq_resource = {
-	.name	= "irqs",
-	.start	= io_v2p(PKUNITY_INTC_BASE),
-	.end	= io_v2p(PKUNITY_INTC_BASE) + 0xFFFFF,
-};
-
-static struct puv3_irq_state {
-	unsigned int	saved;
-	unsigned int	icmr;
-	unsigned int	iclr;
-	unsigned int	iccr;
-} puv3_irq_state;
-
-static int puv3_irq_suspend(void)
-{
-	struct puv3_irq_state *st = &puv3_irq_state;
-
-	st->saved = 1;
-	st->icmr = readl(INTC_ICMR);
-	st->iclr = readl(INTC_ICLR);
-	st->iccr = readl(INTC_ICCR);
-
-	/*
-	 * Disable all GPIO-based interrupts.
-	 */
-	writel(readl(INTC_ICMR) & ~(0x1ff), INTC_ICMR);
-
-	/*
-	 * Set the appropriate edges for wakeup.
-	 */
-	writel(readl(PM_PWER) & GPIO_IRQ_rising_edge, GPIO_GRER);
-	writel(readl(PM_PWER) & GPIO_IRQ_falling_edge, GPIO_GFER);
-
-	/*
-	 * Clear any pending GPIO interrupts.
-	 */
-	writel(readl(GPIO_GEDR), GPIO_GEDR);
-
-	return 0;
-}
-
-static void puv3_irq_resume(void)
-{
-	struct puv3_irq_state *st = &puv3_irq_state;
-
-	if (st->saved) {
-		writel(st->iccr, INTC_ICCR);
-		writel(st->iclr, INTC_ICLR);
-
-		writel(GPIO_IRQ_rising_edge & GPIO_IRQ_mask, GPIO_GRER);
-		writel(GPIO_IRQ_falling_edge & GPIO_IRQ_mask, GPIO_GFER);
-
-		writel(st->icmr, INTC_ICMR);
-	}
-}
-
-static struct syscore_ops puv3_irq_syscore_ops = {
-	.suspend	= puv3_irq_suspend,
-	.resume		= puv3_irq_resume,
-};
-
-static int __init puv3_irq_init_syscore(void)
-{
-	register_syscore_ops(&puv3_irq_syscore_ops);
-	return 0;
-}
-
-device_initcall(puv3_irq_init_syscore);
-
-void __init init_IRQ(void)
-{
-	unsigned int irq;
-
-	request_resource(&iomem_resource, &irq_resource);
-
-	/* disable all IRQs */
-	writel(0, INTC_ICMR);
-
-	/* all IRQs are IRQ, not REAL */
-	writel(0, INTC_ICLR);
-
-	/* clear all GPIO edge detects */
-	writel(FMASK(8, 0) & ~FIELD(1, 1, GPI_SOFF_REQ), GPIO_GPIR);
-	writel(0, GPIO_GFER);
-	writel(0, GPIO_GRER);
-	writel(0x0FFFFFFF, GPIO_GEDR);
-
-	writel(1, INTC_ICCR);
-
-	for (irq = 0; irq < IRQ_GPIOHIGH; irq++) {
-		irq_set_chip(irq, &puv3_low_gpio_chip);
-		irq_set_handler(irq, handle_edge_irq);
-		irq_modify_status(irq,
-			IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN,
-			0);
-	}
-
-	for (irq = IRQ_GPIOHIGH + 1; irq < IRQ_GPIO0; irq++) {
-		irq_set_chip(irq, &puv3_normal_chip);
-		irq_set_handler(irq, handle_level_irq);
-		irq_modify_status(irq,
-			IRQ_NOREQUEST | IRQ_NOAUTOEN,
-			IRQ_NOPROBE);
-	}
-
-	for (irq = IRQ_GPIO0; irq <= IRQ_GPIO27; irq++) {
-		irq_set_chip(irq, &puv3_high_gpio_chip);
-		irq_set_handler(irq, handle_edge_irq);
-		irq_modify_status(irq,
-			IRQ_NOREQUEST | IRQ_NOPROBE | IRQ_NOAUTOEN,
-			0);
-	}
-
-	/*
-	 * Install handler for GPIO 0-27 edge detect interrupts
-	 */
-	irq_set_chip(IRQ_GPIOHIGH, &puv3_normal_chip);
-	irq_set_chained_handler(IRQ_GPIOHIGH, puv3_gpio_handler);
-
-#ifdef CONFIG_PUV3_GPIO
-	puv3_init_gpio();
-#endif
-}
-
-/*
- * do_IRQ handles all hardware IRQ's.  Decoded IRQs should not
- * come via this function.  Instead, they should provide their
- * own 'handler'
- */
-asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
-{
-	struct pt_regs *old_regs = set_irq_regs(regs);
-
-	irq_enter();
-
-	/*
-	 * Some hardware gives randomly wrong interrupts.  Rather
-	 * than crashing, do something sensible.
-	 */
-	if (unlikely(irq >= nr_irqs)) {
-		if (printk_ratelimit())
-			printk(KERN_WARNING "Bad IRQ%u\n", irq);
-		ack_bad_irq(irq);
-	} else {
-		generic_handle_irq(irq);
-	}
-
-	irq_exit();
-	set_irq_regs(old_regs);
-}
-
diff --git a/arch/unicore32/kernel/ksyms.c b/arch/unicore32/kernel/ksyms.c
deleted file mode 100644
index 731445008932..000000000000
--- a/arch/unicore32/kernel/ksyms.c
+++ /dev/null
@@ -1,57 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/ksyms.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/in6.h>
-#include <linux/syscalls.h>
-#include <linux/uaccess.h>
-#include <linux/io.h>
-
-#include <asm/checksum.h>
-
-#include "ksyms.h"
-
-EXPORT_SYMBOL(find_first_bit);
-EXPORT_SYMBOL(find_first_zero_bit);
-EXPORT_SYMBOL(find_next_zero_bit);
-EXPORT_SYMBOL(find_next_bit);
-
-	/* platform dependent support */
-EXPORT_SYMBOL(__udelay);
-EXPORT_SYMBOL(__const_udelay);
-
-	/* string / mem functions */
-EXPORT_SYMBOL(strchr);
-EXPORT_SYMBOL(strrchr);
-EXPORT_SYMBOL(memset);
-EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(memmove);
-EXPORT_SYMBOL(memchr);
-
-	/* user mem (segment) */
-EXPORT_SYMBOL(__strnlen_user);
-EXPORT_SYMBOL(__strncpy_from_user);
-
-EXPORT_SYMBOL(copy_page);
-
-EXPORT_SYMBOL(raw_copy_from_user);
-EXPORT_SYMBOL(raw_copy_to_user);
-EXPORT_SYMBOL(__clear_user);
-
-EXPORT_SYMBOL(__ashldi3);
-EXPORT_SYMBOL(__ashrdi3);
-EXPORT_SYMBOL(__divsi3);
-EXPORT_SYMBOL(__lshrdi3);
-EXPORT_SYMBOL(__modsi3);
-EXPORT_SYMBOL(__ucmpdi2);
-EXPORT_SYMBOL(__udivsi3);
-EXPORT_SYMBOL(__umodsi3);
-
diff --git a/arch/unicore32/kernel/ksyms.h b/arch/unicore32/kernel/ksyms.h
deleted file mode 100644
index 5d2d5ba324ac..000000000000
--- a/arch/unicore32/kernel/ksyms.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * libgcc functions - functions that are used internally by the
- * compiler...  (prototypes are not correct though, but that
- * doesn't really matter since they're not versioned).
- */
-extern void __ashldi3(void);
-extern void __ashrdi3(void);
-extern void __divsi3(void);
-extern void __lshrdi3(void);
-extern void __modsi3(void);
-extern void __ucmpdi2(void);
-extern void __udivsi3(void);
-extern void __umodsi3(void);
diff --git a/arch/unicore32/kernel/module.c b/arch/unicore32/kernel/module.c
deleted file mode 100644
index 67c89ef2d6ee..000000000000
--- a/arch/unicore32/kernel/module.c
+++ /dev/null
@@ -1,105 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/module.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/moduleloader.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/elf.h>
-#include <linux/vmalloc.h>
-#include <linux/fs.h>
-#include <linux/string.h>
-#include <linux/gfp.h>
-
-#include <asm/sections.h>
-
-void *module_alloc(unsigned long size)
-{
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
-				__builtin_return_address(0));
-}
-
-int
-apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex,
-	       unsigned int relindex, struct module *module)
-{
-	Elf32_Shdr *symsec = sechdrs + symindex;
-	Elf32_Shdr *relsec = sechdrs + relindex;
-	Elf32_Shdr *dstsec = sechdrs + relsec->sh_info;
-	Elf32_Rel *rel = (void *)relsec->sh_addr;
-	unsigned int i;
-
-	for (i = 0; i < relsec->sh_size / sizeof(Elf32_Rel); i++, rel++) {
-		unsigned long loc;
-		Elf32_Sym *sym;
-		s32 offset;
-
-		offset = ELF32_R_SYM(rel->r_info);
-		if (offset < 0 || offset >
-				(symsec->sh_size / sizeof(Elf32_Sym))) {
-			printk(KERN_ERR "%s: bad relocation, "
-					"section %d reloc %d\n",
-					module->name, relindex, i);
-			return -ENOEXEC;
-		}
-
-		sym = ((Elf32_Sym *)symsec->sh_addr) + offset;
-
-		if (rel->r_offset < 0 || rel->r_offset >
-				dstsec->sh_size - sizeof(u32)) {
-			printk(KERN_ERR "%s: out of bounds relocation, "
-				"section %d reloc %d offset %d size %d\n",
-				module->name, relindex, i, rel->r_offset,
-				dstsec->sh_size);
-			return -ENOEXEC;
-		}
-
-		loc = dstsec->sh_addr + rel->r_offset;
-
-		switch (ELF32_R_TYPE(rel->r_info)) {
-		case R_UNICORE_NONE:
-			/* ignore */
-			break;
-
-		case R_UNICORE_ABS32:
-			*(u32 *)loc += sym->st_value;
-			break;
-
-		case R_UNICORE_PC24:
-		case R_UNICORE_CALL:
-		case R_UNICORE_JUMP24:
-			offset = (*(u32 *)loc & 0x00ffffff) << 2;
-			if (offset & 0x02000000)
-				offset -= 0x04000000;
-
-			offset += sym->st_value - loc;
-			if (offset & 3 ||
-			    offset <= (s32)0xfe000000 ||
-			    offset >= (s32)0x02000000) {
-				printk(KERN_ERR
-				       "%s: relocation out of range, section "
-				       "%d reloc %d sym '%s'\n", module->name,
-				       relindex, i, strtab + sym->st_name);
-				return -ENOEXEC;
-			}
-
-			offset >>= 2;
-
-			*(u32 *)loc &= 0xff000000;
-			*(u32 *)loc |= offset & 0x00ffffff;
-			break;
-
-		default:
-			printk(KERN_ERR "%s: unknown relocation: %u\n",
-			       module->name, ELF32_R_TYPE(rel->r_info));
-			return -ENOEXEC;
-		}
-	}
-	return 0;
-}
diff --git a/arch/unicore32/kernel/pci.c b/arch/unicore32/kernel/pci.c
deleted file mode 100644
index 0d098aa05b47..000000000000
--- a/arch/unicore32/kernel/pci.c
+++ /dev/null
@@ -1,371 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/pci.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  PCI bios-type initialisation for PCI machines
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/pci.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/io.h>
-
-static int debug_pci;
-
-#define CONFIG_CMD(bus, devfn, where)	\
-	(0x80000000 | (bus->number << 16) | (devfn << 8) | (where & ~3))
-
-static int
-puv3_read_config(struct pci_bus *bus, unsigned int devfn, int where,
-			int size, u32 *value)
-{
-	writel(CONFIG_CMD(bus, devfn, where), PCICFG_ADDR);
-	switch (size) {
-	case 1:
-		*value = (readl(PCICFG_DATA) >> ((where & 3) * 8)) & 0xFF;
-		break;
-	case 2:
-		*value = (readl(PCICFG_DATA) >> ((where & 2) * 8)) & 0xFFFF;
-		break;
-	case 4:
-		*value = readl(PCICFG_DATA);
-		break;
-	}
-	return PCIBIOS_SUCCESSFUL;
-}
-
-static int
-puv3_write_config(struct pci_bus *bus, unsigned int devfn, int where,
-			int size, u32 value)
-{
-	writel(CONFIG_CMD(bus, devfn, where), PCICFG_ADDR);
-	switch (size) {
-	case 1:
-		writel((readl(PCICFG_DATA) & ~FMASK(8, (where&3)*8))
-			| FIELD(value, 8, (where&3)*8), PCICFG_DATA);
-		break;
-	case 2:
-		writel((readl(PCICFG_DATA) & ~FMASK(16, (where&2)*8))
-			| FIELD(value, 16, (where&2)*8), PCICFG_DATA);
-		break;
-	case 4:
-		writel(value, PCICFG_DATA);
-		break;
-	}
-	return PCIBIOS_SUCCESSFUL;
-}
-
-struct pci_ops pci_puv3_ops = {
-	.read  = puv3_read_config,
-	.write = puv3_write_config,
-};
-
-void pci_puv3_preinit(void)
-{
-	printk(KERN_DEBUG "PCI: PKUnity PCI Controller Initializing ...\n");
-	/* config PCI bridge base */
-	writel(io_v2p(PKUNITY_PCIBRI_BASE), PCICFG_BRIBASE);
-
-	writel(0, PCIBRI_AHBCTL0);
-	writel(io_v2p(PKUNITY_PCIBRI_BASE) | PCIBRI_BARx_MEM, PCIBRI_AHBBAR0);
-	writel(0xFFFF0000, PCIBRI_AHBAMR0);
-	writel(0, PCIBRI_AHBTAR0);
-
-	writel(PCIBRI_CTLx_AT, PCIBRI_AHBCTL1);
-	writel(io_v2p(PKUNITY_PCILIO_BASE) | PCIBRI_BARx_IO, PCIBRI_AHBBAR1);
-	writel(0xFFFF0000, PCIBRI_AHBAMR1);
-	writel(0x00000000, PCIBRI_AHBTAR1);
-
-	writel(PCIBRI_CTLx_PREF, PCIBRI_AHBCTL2);
-	writel(io_v2p(PKUNITY_PCIMEM_BASE) | PCIBRI_BARx_MEM, PCIBRI_AHBBAR2);
-	writel(0xF8000000, PCIBRI_AHBAMR2);
-	writel(0, PCIBRI_AHBTAR2);
-
-	writel(io_v2p(PKUNITY_PCIAHB_BASE) | PCIBRI_BARx_MEM, PCIBRI_BAR1);
-
-	writel(PCIBRI_CTLx_AT | PCIBRI_CTLx_PREF, PCIBRI_PCICTL0);
-	writel(io_v2p(PKUNITY_PCIAHB_BASE) | PCIBRI_BARx_MEM, PCIBRI_PCIBAR0);
-	writel(0xF8000000, PCIBRI_PCIAMR0);
-	writel(PKUNITY_SDRAM_BASE, PCIBRI_PCITAR0);
-
-	writel(readl(PCIBRI_CMD) | PCIBRI_CMD_IO | PCIBRI_CMD_MEM, PCIBRI_CMD);
-}
-
-static int pci_puv3_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
-{
-	if (dev->bus->number == 0) {
-#ifdef CONFIG_ARCH_FPGA /* 4 pci slots */
-		if      (dev->devfn == 0x00)
-			return IRQ_PCIINTA;
-		else if (dev->devfn == 0x08)
-			return IRQ_PCIINTB;
-		else if (dev->devfn == 0x10)
-			return IRQ_PCIINTC;
-		else if (dev->devfn == 0x18)
-			return IRQ_PCIINTD;
-#endif
-#ifdef CONFIG_PUV3_DB0913 /* 3 pci slots */
-		if      (dev->devfn == 0x30)
-			return IRQ_PCIINTB;
-		else if (dev->devfn == 0x60)
-			return IRQ_PCIINTC;
-		else if (dev->devfn == 0x58)
-			return IRQ_PCIINTD;
-#endif
-#if	defined(CONFIG_PUV3_NB0916) || defined(CONFIG_PUV3_SMW0919)
-		/* only support 2 pci devices */
-		if      (dev->devfn == 0x00)
-			return IRQ_PCIINTC; /* sata */
-#endif
-	}
-	return -1;
-}
-
-/*
- * Only first 128MB of memory can be accessed via PCI.
- * We use GFP_DMA to allocate safe buffers to do map/unmap.
- * This is really ugly and we need a better way of specifying
- * DMA-capable regions of memory.
- */
-void __init puv3_pci_adjust_zones(unsigned long max_zone_pfn)
-{
-	unsigned int sz = SZ_128M >> PAGE_SHIFT;
-
-	max_zone_pfn[ZONE_DMA] = sz;
-}
-
-/*
- * If the bus contains any of these devices, then we must not turn on
- * parity checking of any kind.
- */
-static inline int pdev_bad_for_parity(struct pci_dev *dev)
-{
-	return 0;
-}
-
-/*
- * pcibios_fixup_bus - Called after each bus is probed,
- * but before its children are examined.
- */
-void pcibios_fixup_bus(struct pci_bus *bus)
-{
-	struct pci_dev *dev;
-	u16 features = PCI_COMMAND_SERR
-		| PCI_COMMAND_PARITY
-		| PCI_COMMAND_FAST_BACK;
-
-	bus->resource[0] = &ioport_resource;
-	bus->resource[1] = &iomem_resource;
-
-	/*
-	 * Walk the devices on this bus, working out what we can
-	 * and can't support.
-	 */
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		u16 status;
-
-		pci_read_config_word(dev, PCI_STATUS, &status);
-
-		/*
-		 * If any device on this bus does not support fast back
-		 * to back transfers, then the bus as a whole is not able
-		 * to support them.  Having fast back to back transfers
-		 * on saves us one PCI cycle per transaction.
-		 */
-		if (!(status & PCI_STATUS_FAST_BACK))
-			features &= ~PCI_COMMAND_FAST_BACK;
-
-		if (pdev_bad_for_parity(dev))
-			features &= ~(PCI_COMMAND_SERR
-					| PCI_COMMAND_PARITY);
-
-		switch (dev->class >> 8) {
-		case PCI_CLASS_BRIDGE_PCI:
-			pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &status);
-			status |= PCI_BRIDGE_CTL_PARITY
-				| PCI_BRIDGE_CTL_MASTER_ABORT;
-			status &= ~(PCI_BRIDGE_CTL_BUS_RESET
-				| PCI_BRIDGE_CTL_FAST_BACK);
-			pci_write_config_word(dev, PCI_BRIDGE_CONTROL, status);
-			break;
-
-		case PCI_CLASS_BRIDGE_CARDBUS:
-			pci_read_config_word(dev, PCI_CB_BRIDGE_CONTROL,
-					&status);
-			status |= PCI_CB_BRIDGE_CTL_PARITY
-				| PCI_CB_BRIDGE_CTL_MASTER_ABORT;
-			pci_write_config_word(dev, PCI_CB_BRIDGE_CONTROL,
-					status);
-			break;
-		}
-	}
-
-	/*
-	 * Now walk the devices again, this time setting them up.
-	 */
-	list_for_each_entry(dev, &bus->devices, bus_list) {
-		u16 cmd;
-
-		pci_read_config_word(dev, PCI_COMMAND, &cmd);
-		cmd |= features;
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-
-		pci_write_config_byte(dev, PCI_CACHE_LINE_SIZE,
-				      L1_CACHE_BYTES >> 2);
-	}
-
-	/*
-	 * Propagate the flags to the PCI bridge.
-	 */
-	if (bus->self && bus->self->hdr_type == PCI_HEADER_TYPE_BRIDGE) {
-		if (features & PCI_COMMAND_FAST_BACK)
-			bus->bridge_ctl |= PCI_BRIDGE_CTL_FAST_BACK;
-		if (features & PCI_COMMAND_PARITY)
-			bus->bridge_ctl |= PCI_BRIDGE_CTL_PARITY;
-	}
-
-	/*
-	 * Report what we did for this bus
-	 */
-	printk(KERN_INFO "PCI: bus%d: Fast back to back transfers %sabled\n",
-		bus->number, (features & PCI_COMMAND_FAST_BACK) ? "en" : "dis");
-}
-EXPORT_SYMBOL(pcibios_fixup_bus);
-
-static struct resource busn_resource = {
-	.name	= "PCI busn",
-	.start	= 0,
-	.end	= 255,
-	.flags	= IORESOURCE_BUS,
-};
-
-static int __init pci_common_init(void)
-{
-	struct pci_bus *puv3_bus;
-	struct pci_host_bridge *bridge;
-	int ret;
-
-	bridge = pci_alloc_host_bridge(0);
-	if (!bridge)
-		return -ENOMEM;
-
-	pci_puv3_preinit();
-
-	pci_add_resource(&bridge->windows, &ioport_resource);
-	pci_add_resource(&bridge->windows, &iomem_resource);
-	pci_add_resource(&bridge->windows, &busn_resource);
-	bridge->sysdata = NULL;
-	bridge->busnr = 0;
-	bridge->ops = &pci_puv3_ops;
-	bridge->swizzle_irq = pci_common_swizzle;
-	bridge->map_irq = pci_puv3_map_irq;
-
-	/* Scan our single hose.  */
-	ret = pci_scan_root_bus_bridge(bridge);
-	if (ret) {
-		pci_free_host_bridge(bridge);
-		return;
-	}
-
-	puv3_bus = bridge->bus;
-
-	if (!puv3_bus)
-		panic("PCI: unable to scan bus!");
-
-	pci_bus_size_bridges(puv3_bus);
-	pci_bus_assign_resources(puv3_bus);
-	pci_bus_add_devices(puv3_bus);
-	return 0;
-}
-subsys_initcall(pci_common_init);
-
-char * __init pcibios_setup(char *str)
-{
-	if (!strcmp(str, "debug")) {
-		debug_pci = 1;
-		return NULL;
-	}
-	return str;
-}
-
-void pcibios_set_master(struct pci_dev *dev)
-{
-	/* No special bus mastering setup handling */
-}
-
-/*
- * From arch/i386/kernel/pci-i386.c:
- *
- * We need to avoid collisions with `mirrored' VGA ports
- * and other strange ISA hardware, so we always want the
- * addresses to be allocated in the 0x000-0x0ff region
- * modulo 0x400.
- *
- * Why? Because some silly external IO cards only decode
- * the low 10 bits of the IO address. The 0x00-0xff region
- * is reserved for motherboard devices that decode all 16
- * bits, so it's ok to allocate at, say, 0x2800-0x28ff,
- * but we want to try to avoid allocating at 0x2900-0x2bff
- * which might be mirrored at 0x0100-0x03ff..
- */
-resource_size_t pcibios_align_resource(void *data, const struct resource *res,
-				resource_size_t size, resource_size_t align)
-{
-	resource_size_t start = res->start;
-
-	if (res->flags & IORESOURCE_IO && start & 0x300)
-		start = (start + 0x3ff) & ~0x3ff;
-
-	start = (start + align - 1) & ~(align - 1);
-
-	return start;
-}
-
-/**
- * pcibios_enable_device - Enable I/O and memory.
- * @dev: PCI device to be enabled
- */
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-	u16 cmd, old_cmd;
-	int idx;
-	struct resource *r;
-
-	pci_read_config_word(dev, PCI_COMMAND, &cmd);
-	old_cmd = cmd;
-	for (idx = 0; idx < 6; idx++) {
-		/* Only set up the requested stuff */
-		if (!(mask & (1 << idx)))
-			continue;
-
-		r = dev->resource + idx;
-		if (!r->start && r->end) {
-			printk(KERN_ERR "PCI: Device %s not available because"
-			       " of resource collisions\n", pci_name(dev));
-			return -EINVAL;
-		}
-		if (r->flags & IORESOURCE_IO)
-			cmd |= PCI_COMMAND_IO;
-		if (r->flags & IORESOURCE_MEM)
-			cmd |= PCI_COMMAND_MEMORY;
-	}
-
-	/*
-	 * Bridges (eg, cardbus bridges) need to be fully enabled
-	 */
-	if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
-		cmd |= PCI_COMMAND_IO | PCI_COMMAND_MEMORY;
-
-	if (cmd != old_cmd) {
-		printk("PCI: enabling device %s (%04x -> %04x)\n",
-		       pci_name(dev), old_cmd, cmd);
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-	}
-	return 0;
-}
diff --git a/arch/unicore32/kernel/pm.c b/arch/unicore32/kernel/pm.c
deleted file mode 100644
index 94b7f9df6c1a..000000000000
--- a/arch/unicore32/kernel/pm.c
+++ /dev/null
@@ -1,121 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/pm.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/suspend.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/io.h>
-
-#include <mach/hardware.h>
-#include <mach/pm.h>
-
-#include "setup.h"
-
-struct puv3_cpu_pm_fns *puv3_cpu_pm_fns;
-static unsigned long *sleep_save;
-
-int puv3_pm_enter(suspend_state_t state)
-{
-	unsigned long sleep_save_checksum = 0, checksum = 0;
-	int i;
-
-	/* skip registers saving for standby */
-	if (state != PM_SUSPEND_STANDBY) {
-		puv3_cpu_pm_fns->save(sleep_save);
-		/* before sleeping, calculate and save a checksum */
-		for (i = 0; i < puv3_cpu_pm_fns->save_count - 1; i++)
-			sleep_save_checksum += sleep_save[i];
-	}
-
-	/* *** go zzz *** */
-	puv3_cpu_pm_fns->enter(state);
-	cpu_init();
-#ifdef CONFIG_INPUT_KEYBOARD
-	puv3_ps2_init();
-#endif
-#ifdef CONFIG_PCI
-	pci_puv3_preinit();
-#endif
-	if (state != PM_SUSPEND_STANDBY) {
-		/* after sleeping, validate the checksum */
-		for (i = 0; i < puv3_cpu_pm_fns->save_count - 1; i++)
-			checksum += sleep_save[i];
-
-		/* if invalid, display message and wait for a hardware reset */
-		if (checksum != sleep_save_checksum) {
-			while (1)
-				puv3_cpu_pm_fns->enter(state);
-		}
-		puv3_cpu_pm_fns->restore(sleep_save);
-	}
-
-	pr_debug("*** made it back from resume\n");
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(puv3_pm_enter);
-
-unsigned long sleep_phys_sp(void *sp)
-{
-	return virt_to_phys(sp);
-}
-
-static int puv3_pm_valid(suspend_state_t state)
-{
-	if (puv3_cpu_pm_fns)
-		return puv3_cpu_pm_fns->valid(state);
-
-	return -EINVAL;
-}
-
-static int puv3_pm_prepare(void)
-{
-	int ret = 0;
-
-	if (puv3_cpu_pm_fns && puv3_cpu_pm_fns->prepare)
-		ret = puv3_cpu_pm_fns->prepare();
-
-	return ret;
-}
-
-static void puv3_pm_finish(void)
-{
-	if (puv3_cpu_pm_fns && puv3_cpu_pm_fns->finish)
-		puv3_cpu_pm_fns->finish();
-}
-
-static struct platform_suspend_ops puv3_pm_ops = {
-	.valid		= puv3_pm_valid,
-	.enter		= puv3_pm_enter,
-	.prepare	= puv3_pm_prepare,
-	.finish		= puv3_pm_finish,
-};
-
-static int __init puv3_pm_init(void)
-{
-	if (!puv3_cpu_pm_fns) {
-		printk(KERN_ERR "no valid puv3_cpu_pm_fns defined\n");
-		return -EINVAL;
-	}
-
-	sleep_save = kmalloc_array(puv3_cpu_pm_fns->save_count,
-				   sizeof(unsigned long),
-				   GFP_KERNEL);
-	if (!sleep_save) {
-		printk(KERN_ERR "failed to alloc memory for pm save\n");
-		return -ENOMEM;
-	}
-
-	suspend_set_ops(&puv3_pm_ops);
-	return 0;
-}
-
-device_initcall(puv3_pm_init);
diff --git a/arch/unicore32/kernel/process.c b/arch/unicore32/kernel/process.c
deleted file mode 100644
index b4fd3a604a18..000000000000
--- a/arch/unicore32/kernel/process.c
+++ /dev/null
@@ -1,319 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/process.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <stdarg.h>
-
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/task.h>
-#include <linux/sched/task_stack.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/stddef.h>
-#include <linux/unistd.h>
-#include <linux/delay.h>
-#include <linux/reboot.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/cpu.h>
-#include <linux/elfcore.h>
-#include <linux/pm.h>
-#include <linux/tick.h>
-#include <linux/utsname.h>
-#include <linux/uaccess.h>
-#include <linux/random.h>
-#include <linux/gpio.h>
-#include <linux/stacktrace.h>
-
-#include <asm/cacheflush.h>
-#include <asm/processor.h>
-#include <asm/stacktrace.h>
-
-#include "setup.h"
-
-static const char * const processor_modes[] = {
-	"UK00", "UK01", "UK02", "UK03", "UK04", "UK05", "UK06", "UK07",
-	"UK08", "UK09", "UK0A", "UK0B", "UK0C", "UK0D", "UK0E", "UK0F",
-	"USER", "REAL", "INTR", "PRIV", "UK14", "UK15", "UK16", "ABRT",
-	"UK18", "UK19", "UK1A", "EXTN", "UK1C", "UK1D", "UK1E", "SUSR"
-};
-
-void arch_cpu_idle(void)
-{
-	cpu_do_idle();
-	local_irq_enable();
-}
-
-void machine_halt(void)
-{
-	gpio_set_value(GPO_SOFT_OFF, 0);
-}
-
-/*
- * Function pointers to optional machine specific functions
- */
-void (*pm_power_off)(void) = NULL;
-EXPORT_SYMBOL(pm_power_off);
-
-void machine_power_off(void)
-{
-	if (pm_power_off)
-		pm_power_off();
-	machine_halt();
-}
-
-void machine_restart(char *cmd)
-{
-	/* Disable interrupts first */
-	local_irq_disable();
-
-	/*
-	 * Tell the mm system that we are going to reboot -
-	 * we may need it to insert some 1:1 mappings so that
-	 * soft boot works.
-	 */
-	setup_mm_for_reboot();
-
-	/* Clean and invalidate caches */
-	flush_cache_all();
-
-	/* Turn off caching */
-	cpu_proc_fin();
-
-	/* Push out any further dirty data, and ensure cache is empty */
-	flush_cache_all();
-
-	/*
-	 * Now handle reboot code.
-	 */
-	if (reboot_mode == REBOOT_SOFT) {
-		/* Jump into ROM at address 0xffff0000 */
-		cpu_reset(VECTORS_BASE);
-	} else {
-		writel(0x00002001, PM_PLLSYSCFG); /* cpu clk = 250M */
-		writel(0x00100800, PM_PLLDDRCFG); /* ddr clk =  44M */
-		writel(0x00002001, PM_PLLVGACFG); /* vga clk = 250M */
-
-		/* Use on-chip reset capability */
-		/* following instructions must be in one icache line */
-		__asm__ __volatile__(
-			"	.align 5\n\t"
-			"	stw	%1, [%0]\n\t"
-			"201:	ldw	r0, [%0]\n\t"
-			"	cmpsub.a	r0, #0\n\t"
-			"	bne	201b\n\t"
-			"	stw	%3, [%2]\n\t"
-			"	nop; nop; nop\n\t"
-			/* prefetch 3 instructions at most */
-			:
-			: "r" (PM_PMCR),
-			  "r" (PM_PMCR_CFBSYS | PM_PMCR_CFBDDR
-				| PM_PMCR_CFBVGA),
-			  "r" (RESETC_SWRR),
-			  "r" (RESETC_SWRR_SRB)
-			: "r0", "memory");
-	}
-
-	/*
-	 * Whoops - the architecture was unable to reboot.
-	 * Tell the user!
-	 */
-	mdelay(1000);
-	printk(KERN_EMERG "Reboot failed -- System halted\n");
-	do { } while (1);
-}
-
-void __show_regs(struct pt_regs *regs)
-{
-	unsigned long flags;
-	char buf[64];
-
-	show_regs_print_info(KERN_DEFAULT);
-	printk("PC is at %pS\n", (void *)instruction_pointer(regs));
-	printk("LR is at %pS\n", (void *)regs->UCreg_lr);
-	printk(KERN_DEFAULT "pc : [<%08lx>]    lr : [<%08lx>]    psr: %08lx\n"
-	       "sp : %08lx  ip : %08lx  fp : %08lx\n",
-		regs->UCreg_pc, regs->UCreg_lr, regs->UCreg_asr,
-		regs->UCreg_sp, regs->UCreg_ip, regs->UCreg_fp);
-	printk(KERN_DEFAULT "r26: %08lx  r25: %08lx  r24: %08lx\n",
-		regs->UCreg_26, regs->UCreg_25,
-		regs->UCreg_24);
-	printk(KERN_DEFAULT "r23: %08lx  r22: %08lx  r21: %08lx  r20: %08lx\n",
-		regs->UCreg_23, regs->UCreg_22,
-		regs->UCreg_21, regs->UCreg_20);
-	printk(KERN_DEFAULT "r19: %08lx  r18: %08lx  r17: %08lx  r16: %08lx\n",
-		regs->UCreg_19, regs->UCreg_18,
-		regs->UCreg_17, regs->UCreg_16);
-	printk(KERN_DEFAULT "r15: %08lx  r14: %08lx  r13: %08lx  r12: %08lx\n",
-		regs->UCreg_15, regs->UCreg_14,
-		regs->UCreg_13, regs->UCreg_12);
-	printk(KERN_DEFAULT "r11: %08lx  r10: %08lx  r9 : %08lx  r8 : %08lx\n",
-		regs->UCreg_11, regs->UCreg_10,
-		regs->UCreg_09, regs->UCreg_08);
-	printk(KERN_DEFAULT "r7 : %08lx  r6 : %08lx  r5 : %08lx  r4 : %08lx\n",
-		regs->UCreg_07, regs->UCreg_06,
-		regs->UCreg_05, regs->UCreg_04);
-	printk(KERN_DEFAULT "r3 : %08lx  r2 : %08lx  r1 : %08lx  r0 : %08lx\n",
-		regs->UCreg_03, regs->UCreg_02,
-		regs->UCreg_01, regs->UCreg_00);
-
-	flags = regs->UCreg_asr;
-	buf[0] = flags & PSR_S_BIT ? 'S' : 's';
-	buf[1] = flags & PSR_Z_BIT ? 'Z' : 'z';
-	buf[2] = flags & PSR_C_BIT ? 'C' : 'c';
-	buf[3] = flags & PSR_V_BIT ? 'V' : 'v';
-	buf[4] = '\0';
-
-	printk(KERN_DEFAULT "Flags: %s  INTR o%s  REAL o%s  Mode %s  Segment %s\n",
-		buf, interrupts_enabled(regs) ? "n" : "ff",
-		fast_interrupts_enabled(regs) ? "n" : "ff",
-		processor_modes[processor_mode(regs)],
-		uaccess_kernel() ? "kernel" : "user");
-	{
-		unsigned int ctrl;
-
-		buf[0] = '\0';
-		{
-			unsigned int transbase;
-			asm("movc %0, p0.c2, #0\n"
-			    : "=r" (transbase));
-			snprintf(buf, sizeof(buf), "  Table: %08x", transbase);
-		}
-		asm("movc %0, p0.c1, #0\n" : "=r" (ctrl));
-
-		printk(KERN_DEFAULT "Control: %08x%s\n", ctrl, buf);
-	}
-}
-
-void show_regs(struct pt_regs *regs)
-{
-	printk(KERN_DEFAULT "\n");
-	printk(KERN_DEFAULT "Pid: %d, comm: %20s\n",
-			task_pid_nr(current), current->comm);
-	__show_regs(regs);
-	__backtrace();
-}
-
-void flush_thread(void)
-{
-	struct thread_info *thread = current_thread_info();
-	struct task_struct *tsk = current;
-
-	memset(thread->used_cp, 0, sizeof(thread->used_cp));
-	memset(&tsk->thread.debug, 0, sizeof(struct debug_info));
-#ifdef CONFIG_UNICORE_FPU_F64
-	memset(&thread->fpstate, 0, sizeof(struct fp_state));
-#endif
-}
-
-void release_thread(struct task_struct *dead_task)
-{
-}
-
-asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
-asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
-
-int
-copy_thread(unsigned long clone_flags, unsigned long stack_start,
-	    unsigned long stk_sz, struct task_struct *p)
-{
-	struct thread_info *thread = task_thread_info(p);
-	struct pt_regs *childregs = task_pt_regs(p);
-
-	memset(&thread->cpu_context, 0, sizeof(struct cpu_context_save));
-	thread->cpu_context.sp = (unsigned long)childregs;
-	if (unlikely(p->flags & PF_KTHREAD)) {
-		thread->cpu_context.pc = (unsigned long)ret_from_kernel_thread;
-		thread->cpu_context.r4 = stack_start;
-		thread->cpu_context.r5 = stk_sz;
-		memset(childregs, 0, sizeof(struct pt_regs));
-	} else {
-		thread->cpu_context.pc = (unsigned long)ret_from_fork;
-		*childregs = *current_pt_regs();
-		childregs->UCreg_00 = 0;
-		if (stack_start)
-			childregs->UCreg_sp = stack_start;
-
-		if (clone_flags & CLONE_SETTLS)
-			childregs->UCreg_16 = childregs->UCreg_03;
-	}
-	return 0;
-}
-
-/*
- * Fill in the task's elfregs structure for a core dump.
- */
-int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs)
-{
-	elf_core_copy_regs(elfregs, task_pt_regs(t));
-	return 1;
-}
-
-/*
- * fill in the fpe structure for a core dump...
- */
-int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fp)
-{
-	struct thread_info *thread = current_thread_info();
-	int used_math = thread->used_cp[1] | thread->used_cp[2];
-
-#ifdef CONFIG_UNICORE_FPU_F64
-	if (used_math)
-		memcpy(fp, &thread->fpstate, sizeof(*fp));
-#endif
-	return used_math != 0;
-}
-EXPORT_SYMBOL(dump_fpu);
-
-unsigned long get_wchan(struct task_struct *p)
-{
-	struct stackframe frame;
-	int count = 0;
-	if (!p || p == current || p->state == TASK_RUNNING)
-		return 0;
-
-	frame.fp = thread_saved_fp(p);
-	frame.sp = thread_saved_sp(p);
-	frame.lr = 0;			/* recovered from the stack */
-	frame.pc = thread_saved_pc(p);
-	do {
-		int ret = unwind_frame(&frame);
-		if (ret < 0)
-			return 0;
-		if (!in_sched_functions(frame.pc))
-			return frame.pc;
-	} while ((count++) < 16);
-	return 0;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	return randomize_page(mm->brk, 0x02000000);
-}
-
-/*
- * The vectors page is always readable from user space for the
- * atomic helpers and the signal restart code.  Let's declare a mapping
- * for it so it is visible through ptrace and /proc/<pid>/mem.
- */
-
-int vectors_user_mapping(void)
-{
-	struct mm_struct *mm = current->mm;
-	return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
-				       VM_READ | VM_EXEC |
-				       VM_MAYREAD | VM_MAYEXEC |
-				       VM_DONTEXPAND | VM_DONTDUMP,
-				       NULL);
-}
-
-const char *arch_vma_name(struct vm_area_struct *vma)
-{
-	return (vma->vm_start == 0xffff0000) ? "[vectors]" : NULL;
-}
diff --git a/arch/unicore32/kernel/ptrace.c b/arch/unicore32/kernel/ptrace.c
deleted file mode 100644
index 0f216567b90a..000000000000
--- a/arch/unicore32/kernel/ptrace.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/ptrace.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * By Ross Biro 1/23/92
- */
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/signal.h>
-#include <linux/uaccess.h>
-#include <linux/sched/task_stack.h>
-
-/*
- * this routine will get a word off of the processes privileged stack.
- * the offset is how far from the base addr as stored in the THREAD.
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */
-static inline long get_user_reg(struct task_struct *task, int offset)
-{
-	return task_pt_regs(task)->uregs[offset];
-}
-
-/*
- * this routine will put a word on the processes privileged stack.
- * the offset is how far from the base addr as stored in the THREAD.
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */
-static inline int
-put_user_reg(struct task_struct *task, int offset, long data)
-{
-	struct pt_regs newregs, *regs = task_pt_regs(task);
-	int ret = -EINVAL;
-
-	newregs = *regs;
-	newregs.uregs[offset] = data;
-
-	if (valid_user_regs(&newregs)) {
-		regs->uregs[offset] = data;
-		ret = 0;
-	}
-
-	return ret;
-}
-
-/*
- * Called by kernel/ptrace.c when detaching..
- */
-void ptrace_disable(struct task_struct *child)
-{
-}
-
-/*
- * We actually access the pt_regs stored on the kernel stack.
- */
-static int ptrace_read_user(struct task_struct *tsk, unsigned long off,
-			    unsigned long __user *ret)
-{
-	unsigned long tmp;
-
-	tmp = 0;
-	if (off < sizeof(struct pt_regs))
-		tmp = get_user_reg(tsk, off >> 2);
-
-	return put_user(tmp, ret);
-}
-
-/*
- * We actually access the pt_regs stored on the kernel stack.
- */
-static int ptrace_write_user(struct task_struct *tsk, unsigned long off,
-			     unsigned long val)
-{
-	if (off >= sizeof(struct pt_regs))
-		return 0;
-
-	return put_user_reg(tsk, off >> 2, val);
-}
-
-long arch_ptrace(struct task_struct *child, long request,
-		 unsigned long addr, unsigned long data)
-{
-	int ret;
-	unsigned long __user *datap = (unsigned long __user *) data;
-
-	switch (request) {
-	case PTRACE_PEEKUSR:
-		ret = ptrace_read_user(child, addr, datap);
-		break;
-
-	case PTRACE_POKEUSR:
-		ret = ptrace_write_user(child, addr, data);
-		break;
-
-	case PTRACE_GET_THREAD_AREA:
-		ret = put_user(task_pt_regs(child)->UCreg_16,
-			       datap);
-		break;
-
-	default:
-		ret = ptrace_request(child, request, addr, data);
-		break;
-	}
-
-	return ret;
-}
-
-asmlinkage int syscall_trace(int why, struct pt_regs *regs, int scno)
-{
-	unsigned long ip;
-
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return scno;
-	if (!(current->ptrace & PT_PTRACED))
-		return scno;
-
-	/*
-	 * Save IP.  IP is used to denote syscall entry/exit:
-	 *  IP = 0 -> entry, = 1 -> exit
-	 */
-	ip = regs->UCreg_ip;
-	regs->UCreg_ip = why;
-
-	current_thread_info()->syscall = scno;
-
-	/* the 0x80 provides a way for the tracing parent to distinguish
-	   between a syscall stop and SIGTRAP delivery */
-	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
-				 ? 0x80 : 0));
-	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
-	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
-	regs->UCreg_ip = ip;
-
-	return current_thread_info()->syscall;
-}
diff --git a/arch/unicore32/kernel/puv3-core.c b/arch/unicore32/kernel/puv3-core.c
deleted file mode 100644
index 78f12e627365..000000000000
--- a/arch/unicore32/kernel/puv3-core.c
+++ /dev/null
@@ -1,276 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  linux/arch/unicore32/kernel/puv3-core.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/init.h>
-#include <linux/device.h>
-#include <linux/amba/bus.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-#include <linux/cnt32_to_63.h>
-#include <linux/usb/musb.h>
-
-#include <asm/irq.h>
-#include <mach/hardware.h>
-#include <mach/pm.h>
-
-/*
- * This is the PKUnity sched_clock implementation.  This has
- * a resolution of 271ns, and a maximum value of 32025597s (370 days).
- *
- * The return value is guaranteed to be monotonic in that range as
- * long as there is always less than 582 seconds between successive
- * calls to this function.
- *
- *  ( * 1E9 / CLOCK_TICK_RATE ) -> about 2235/32
- */
-unsigned long long sched_clock(void)
-{
-	unsigned long long v = cnt32_to_63(readl(OST_OSCR));
-
-	/* original conservative method, but overflow frequently
-	 * v *= NSEC_PER_SEC >> 12;
-	 * do_div(v, CLOCK_TICK_RATE >> 12);
-	 */
-	v = ((v & 0x7fffffffffffffffULL) * 2235) >> 5;
-
-	return v;
-}
-
-static struct resource puv3_usb_resources[] = {
-	/* order is significant! */
-	{
-		.start		= io_v2p(PKUNITY_USB_BASE),
-		.end		= io_v2p(PKUNITY_USB_BASE) + 0x3ff,
-		.flags		= IORESOURCE_MEM,
-	}, {
-		.start		= IRQ_USB,
-		.flags		= IORESOURCE_IRQ,
-	}, {
-		.start		= IRQ_USB,
-		.flags		= IORESOURCE_IRQ,
-	},
-};
-
-static struct musb_hdrc_config	puv3_usb_config[] = {
-	{
-		.num_eps = 16,
-		.multipoint = 1,
-#ifdef CONFIG_USB_INVENTRA_DMA
-		.dma = 1,
-		.dma_channels = 8,
-#endif
-	},
-};
-
-static struct musb_hdrc_platform_data puv3_usb_plat = {
-	.mode		= MUSB_HOST,
-	.min_power	= 100,
-	.clock		= 0,
-	.config		= puv3_usb_config,
-};
-
-static struct resource puv3_mmc_resources[] = {
-	[0] = {
-		.start	= io_v2p(PKUNITY_SDC_BASE),
-		.end	= io_v2p(PKUNITY_SDC_BASE) + 0xfff,
-		.flags	= IORESOURCE_MEM,
-	},
-	[1] = {
-		.start	= IRQ_SDC,
-		.end	= IRQ_SDC,
-		.flags	= IORESOURCE_IRQ,
-	},
-};
-
-static struct resource puv3_unigfx_resources[] = {
-	[0] = {
-		.start	= io_v2p(PKUNITY_UNIGFX_BASE),
-		.end	= io_v2p(PKUNITY_UNIGFX_BASE) + 0xfff,
-		.flags	= IORESOURCE_MEM,
-	},
-};
-
-static struct resource puv3_rtc_resources[] = {
-	[0] = {
-		.start = io_v2p(PKUNITY_RTC_BASE),
-		.end   = io_v2p(PKUNITY_RTC_BASE) + 0xff,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_RTCAlarm,
-		.end   = IRQ_RTCAlarm,
-		.flags = IORESOURCE_IRQ,
-	},
-	[2] = {
-		.start = IRQ_RTC,
-		.end   = IRQ_RTC,
-		.flags = IORESOURCE_IRQ
-	}
-};
-
-static struct resource puv3_pwm_resources[] = {
-	[0] = {
-		.start	= io_v2p(PKUNITY_OST_BASE) + 0x80,
-		.end	= io_v2p(PKUNITY_OST_BASE) + 0xff,
-		.flags	= IORESOURCE_MEM,
-	},
-};
-
-static struct resource puv3_uart0_resources[] = {
-	[0] = {
-		.start = io_v2p(PKUNITY_UART0_BASE),
-		.end   = io_v2p(PKUNITY_UART0_BASE) + 0xff,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_UART0,
-		.end   = IRQ_UART0,
-		.flags = IORESOURCE_IRQ
-	}
-};
-
-static struct resource puv3_uart1_resources[] = {
-	[0] = {
-		.start = io_v2p(PKUNITY_UART1_BASE),
-		.end   = io_v2p(PKUNITY_UART1_BASE) + 0xff,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_UART1,
-		.end   = IRQ_UART1,
-		.flags = IORESOURCE_IRQ
-	}
-};
-
-static struct resource puv3_umal_resources[] = {
-	[0] = {
-		.start = io_v2p(PKUNITY_UMAL_BASE),
-		.end   = io_v2p(PKUNITY_UMAL_BASE) + 0x1fff,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_UMAL,
-		.end   = IRQ_UMAL,
-		.flags = IORESOURCE_IRQ
-	}
-};
-
-#ifdef CONFIG_PUV3_PM
-
-#define SAVE(x)		sleep_save[SLEEP_SAVE_##x] = x
-#define RESTORE(x)	x = sleep_save[SLEEP_SAVE_##x]
-
-/*
- * List of global PXA peripheral registers to preserve.
- * More ones like CP and general purpose register values are preserved
- * with the stack pointer in sleep.S.
- */
-enum {
-	SLEEP_SAVE_PM_PLLDDRCFG,
-	SLEEP_SAVE_COUNT
-};
-
-
-static void puv3_cpu_pm_save(unsigned long *sleep_save)
-{
-/*	SAVE(PM_PLLDDRCFG); */
-}
-
-static void puv3_cpu_pm_restore(unsigned long *sleep_save)
-{
-/*	RESTORE(PM_PLLDDRCFG); */
-}
-
-static int puv3_cpu_pm_prepare(void)
-{
-	/* set resume return address */
-	writel(virt_to_phys(puv3_cpu_resume), PM_DIVCFG);
-	return 0;
-}
-
-static void puv3_cpu_pm_enter(suspend_state_t state)
-{
-	/* Clear reset status */
-	writel(RESETC_RSSR_HWR | RESETC_RSSR_WDR
-			| RESETC_RSSR_SMR | RESETC_RSSR_SWR, RESETC_RSSR);
-
-	switch (state) {
-/*	case PM_SUSPEND_ON:
-		puv3_cpu_idle();
-		break; */
-	case PM_SUSPEND_MEM:
-		puv3_cpu_pm_prepare();
-		puv3_cpu_suspend(PM_PMCR_SFB);
-		break;
-	}
-}
-
-static int puv3_cpu_pm_valid(suspend_state_t state)
-{
-	return state == PM_SUSPEND_MEM;
-}
-
-static void puv3_cpu_pm_finish(void)
-{
-	/* ensure not to come back here if it wasn't intended */
-	/* PSPR = 0; */
-}
-
-static struct puv3_cpu_pm_fns puv3_cpu_pm_fnss = {
-	.save_count	= SLEEP_SAVE_COUNT,
-	.valid		= puv3_cpu_pm_valid,
-	.save		= puv3_cpu_pm_save,
-	.restore	= puv3_cpu_pm_restore,
-	.enter		= puv3_cpu_pm_enter,
-	.prepare	= puv3_cpu_pm_prepare,
-	.finish		= puv3_cpu_pm_finish,
-};
-
-static void __init puv3_init_pm(void)
-{
-	puv3_cpu_pm_fns = &puv3_cpu_pm_fnss;
-}
-#else
-static inline void puv3_init_pm(void) {}
-#endif
-
-void puv3_ps2_init(void)
-{
-	struct clk *bclk32;
-
-	bclk32 = clk_get(NULL, "BUS32_CLK");
-	writel(clk_get_rate(bclk32) / 200000, PS2_CNT); /* should > 5us */
-}
-
-void __init puv3_core_init(void)
-{
-	puv3_init_pm();
-	puv3_ps2_init();
-
-	platform_device_register_simple("PKUnity-v3-RTC", -1,
-			puv3_rtc_resources, ARRAY_SIZE(puv3_rtc_resources));
-	platform_device_register_simple("PKUnity-v3-UMAL", -1,
-			puv3_umal_resources, ARRAY_SIZE(puv3_umal_resources));
-	platform_device_register_simple("PKUnity-v3-MMC", -1,
-			puv3_mmc_resources, ARRAY_SIZE(puv3_mmc_resources));
-	platform_device_register_simple("PKUnity-v3-UNIGFX", -1,
-			puv3_unigfx_resources, ARRAY_SIZE(puv3_unigfx_resources));
-	platform_device_register_simple("PKUnity-v3-PWM", -1,
-			puv3_pwm_resources, ARRAY_SIZE(puv3_pwm_resources));
-	platform_device_register_simple("PKUnity-v3-UART", 0,
-			puv3_uart0_resources, ARRAY_SIZE(puv3_uart0_resources));
-	platform_device_register_simple("PKUnity-v3-UART", 1,
-			puv3_uart1_resources, ARRAY_SIZE(puv3_uart1_resources));
-	platform_device_register_simple("PKUnity-v3-AC97", -1, NULL, 0);
-	platform_device_register_resndata(NULL, "musb_hdrc", -1,
-			puv3_usb_resources, ARRAY_SIZE(puv3_usb_resources),
-			&puv3_usb_plat, sizeof(puv3_usb_plat));
-}
-
diff --git a/arch/unicore32/kernel/puv3-nb0916.c b/arch/unicore32/kernel/puv3-nb0916.c
deleted file mode 100644
index e251f5028396..000000000000
--- a/arch/unicore32/kernel/puv3-nb0916.c
+++ /dev/null
@@ -1,147 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/puv3-nb0916.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/init.h>
-#include <linux/device.h>
-#include <linux/platform_device.h>
-#include <linux/mtd/physmap.h>
-#include <linux/io.h>
-#include <linux/reboot.h>
-#include <linux/interrupt.h>
-#include <linux/i2c.h>
-#include <linux/pwm.h>
-#include <linux/pwm_backlight.h>
-#include <linux/gpio.h>
-#include <linux/gpio_keys.h>
-#include <linux/input.h>
-
-#include <mach/hardware.h>
-
-static struct physmap_flash_data physmap_flash_data = {
-	.width		= 1,
-};
-
-static struct resource physmap_flash_resource = {
-	.start		= 0xFFF80000,
-	.end		= 0xFFFFFFFF,
-	.flags		= IORESOURCE_MEM,
-};
-
-static struct resource puv3_i2c_resources[] = {
-	[0] = {
-		.start = io_v2p(PKUNITY_I2C_BASE),
-		.end   = io_v2p(PKUNITY_I2C_BASE) + 0xff,
-		.flags = IORESOURCE_MEM,
-	},
-	[1] = {
-		.start = IRQ_I2C,
-		.end   = IRQ_I2C,
-		.flags = IORESOURCE_IRQ,
-	}
-};
-
-static struct pwm_lookup nb0916_pwm_lookup[] = {
-	PWM_LOOKUP("PKUnity-v3-PWM", 0, "pwm-backlight", NULL, 70 * 1024,
-		   PWM_POLARITY_NORMAL),
-};
-
-static struct platform_pwm_backlight_data nb0916_backlight_data = {
-	.max_brightness	= 100,
-	.dft_brightness	= 100,
-};
-
-static struct gpio_keys_button nb0916_gpio_keys[] = {
-	{
-		.type	= EV_KEY,
-		.code	= KEY_POWER,
-		.gpio	= GPI_SOFF_REQ,
-		.desc	= "Power Button",
-		.wakeup = 1,
-		.active_low = 1,
-	},
-	{
-		.type	= EV_KEY,
-		.code	= BTN_TOUCH,
-		.gpio	= GPI_BTN_TOUCH,
-		.desc	= "Touchpad Button",
-		.wakeup = 1,
-		.active_low = 1,
-	},
-};
-
-static struct gpio_keys_platform_data nb0916_gpio_button_data = {
-	.buttons	= nb0916_gpio_keys,
-	.nbuttons	= ARRAY_SIZE(nb0916_gpio_keys),
-};
-
-static irqreturn_t nb0916_lcdcaseoff_handler(int irq, void *dev_id)
-{
-	if (gpio_get_value(GPI_LCD_CASE_OFF))
-		gpio_set_value(GPO_LCD_EN, 1);
-	else
-		gpio_set_value(GPO_LCD_EN, 0);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t nb0916_overheat_handler(int irq, void *dev_id)
-{
-	machine_halt();
-	/* SYSTEM HALT, NO RETURN */
-	return IRQ_HANDLED;
-}
-
-static struct i2c_board_info __initdata puv3_i2c_devices[] = {
-	{	I2C_BOARD_INFO("lm75",		I2C_TAR_THERMAL),	},
-	{	I2C_BOARD_INFO("bq27200",	I2C_TAR_PWIC),		},
-	{	I2C_BOARD_INFO("24c02",		I2C_TAR_EEPROM),	},
-};
-
-int __init mach_nb0916_init(void)
-{
-	i2c_register_board_info(0, puv3_i2c_devices,
-			ARRAY_SIZE(puv3_i2c_devices));
-
-	platform_device_register_simple("PKUnity-v3-I2C", -1,
-			puv3_i2c_resources, ARRAY_SIZE(puv3_i2c_resources));
-
-	pwm_add_table(nb0916_pwm_lookup, ARRAY_SIZE(nb0916_pwm_lookup));
-
-	platform_device_register_data(NULL, "pwm-backlight", -1,
-			&nb0916_backlight_data, sizeof(nb0916_backlight_data));
-
-	platform_device_register_data(NULL, "gpio-keys", -1,
-			&nb0916_gpio_button_data, sizeof(nb0916_gpio_button_data));
-
-	platform_device_register_resndata(NULL, "physmap-flash", -1,
-			&physmap_flash_resource, 1,
-			&physmap_flash_data, sizeof(physmap_flash_data));
-
-	if (request_irq(gpio_to_irq(GPI_LCD_CASE_OFF),
-		&nb0916_lcdcaseoff_handler,
-		IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
-		"NB0916 lcd case off", NULL) < 0) {
-
-		printk(KERN_DEBUG "LCD-Case-OFF IRQ %d not available\n",
-			gpio_to_irq(GPI_LCD_CASE_OFF));
-	}
-
-	if (request_irq(gpio_to_irq(GPI_OTP_INT), &nb0916_overheat_handler,
-		IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
-		"NB0916 overheating protection", NULL) < 0) {
-
-		printk(KERN_DEBUG "Overheating Protection IRQ %d not available\n",
-			gpio_to_irq(GPI_OTP_INT));
-	}
-
-	return 0;
-}
-
-subsys_initcall_sync(mach_nb0916_init);
diff --git a/arch/unicore32/kernel/setup.c b/arch/unicore32/kernel/setup.c
deleted file mode 100644
index 0c4242a5ee1d..000000000000
--- a/arch/unicore32/kernel/setup.c
+++ /dev/null
@@ -1,352 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/setup.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/stddef.h>
-#include <linux/ioport.h>
-#include <linux/delay.h>
-#include <linux/utsname.h>
-#include <linux/initrd.h>
-#include <linux/console.h>
-#include <linux/memblock.h>
-#include <linux/seq_file.h>
-#include <linux/screen_info.h>
-#include <linux/init.h>
-#include <linux/root_dev.h>
-#include <linux/cpu.h>
-#include <linux/interrupt.h>
-#include <linux/smp.h>
-#include <linux/fs.h>
-#include <linux/proc_fs.h>
-#include <linux/elf.h>
-#include <linux/io.h>
-
-#include <asm/cputype.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/traps.h>
-#include <asm/memblock.h>
-
-#include "setup.h"
-
-#ifndef MEM_SIZE
-#define MEM_SIZE	(16*1024*1024)
-#endif
-
-struct stack {
-	u32 irq[3];
-	u32 abt[3];
-	u32 und[3];
-} ____cacheline_aligned;
-
-static struct stack stacks[NR_CPUS];
-
-#ifdef CONFIG_VGA_CONSOLE
-struct screen_info screen_info;
-#endif
-
-char elf_platform[ELF_PLATFORM_SIZE];
-EXPORT_SYMBOL(elf_platform);
-
-static char __initdata cmd_line[COMMAND_LINE_SIZE];
-
-static char default_command_line[COMMAND_LINE_SIZE] __initdata = CONFIG_CMDLINE;
-
-/*
- * Standard memory resources
- */
-static struct resource mem_res[] = {
-	{
-		.name = "Kernel code",
-		.start = 0,
-		.end = 0,
-		.flags = IORESOURCE_SYSTEM_RAM
-	},
-	{
-		.name = "Kernel data",
-		.start = 0,
-		.end = 0,
-		.flags = IORESOURCE_SYSTEM_RAM
-	}
-};
-
-#define kernel_code mem_res[0]
-#define kernel_data mem_res[1]
-
-/*
- * These functions re-use the assembly code in head.S, which
- * already provide the required functionality.
- */
-static void __init setup_processor(void)
-{
-	printk(KERN_DEFAULT "CPU: UniCore-II [%08x] revision %d, cr=%08lx\n",
-	       uc32_cpuid, (int)(uc32_cpuid >> 16) & 15, cr_alignment);
-
-	sprintf(init_utsname()->machine, "puv3");
-	sprintf(elf_platform, "ucv2");
-}
-
-/*
- * cpu_init - initialise one CPU.
- *
- * cpu_init sets up the per-CPU stacks.
- */
-void cpu_init(void)
-{
-	unsigned int cpu = smp_processor_id();
-	struct stack *stk = &stacks[cpu];
-
-	/*
-	 * setup stacks for re-entrant exception handlers
-	 */
-	__asm__ (
-	"mov.a	asr, %1\n\t"
-	"add	sp, %0, %2\n\t"
-	"mov.a	asr, %3\n\t"
-	"add	sp, %0, %4\n\t"
-	"mov.a	asr, %5\n\t"
-	"add	sp, %0, %6\n\t"
-	"mov.a	asr, %7"
-	    :
-	    : "r" (stk),
-	      "r" (PSR_R_BIT | PSR_I_BIT | INTR_MODE),
-	      "I" (offsetof(struct stack, irq[0])),
-	      "r" (PSR_R_BIT | PSR_I_BIT | ABRT_MODE),
-	      "I" (offsetof(struct stack, abt[0])),
-	      "r" (PSR_R_BIT | PSR_I_BIT | EXTN_MODE),
-	      "I" (offsetof(struct stack, und[0])),
-	      "r" (PSR_R_BIT | PSR_I_BIT | PRIV_MODE)
-	: "r30", "cc");
-}
-
-static int __init uc32_add_memory(unsigned long start, unsigned long size)
-{
-	struct membank *bank = &meminfo.bank[meminfo.nr_banks];
-
-	if (meminfo.nr_banks >= NR_BANKS) {
-		printk(KERN_CRIT "NR_BANKS too low, "
-			"ignoring memory at %#lx\n", start);
-		return -EINVAL;
-	}
-
-	/*
-	 * Ensure that start/size are aligned to a page boundary.
-	 * Size is appropriately rounded down, start is rounded up.
-	 */
-	size -= start & ~PAGE_MASK;
-
-	bank->start = PAGE_ALIGN(start);
-	bank->size  = size & PAGE_MASK;
-
-	/*
-	 * Check whether this memory region has non-zero size or
-	 * invalid node number.
-	 */
-	if (bank->size == 0)
-		return -EINVAL;
-
-	meminfo.nr_banks++;
-	return 0;
-}
-
-/*
- * Pick out the memory size.  We look for mem=size@start,
- * where start and size are "size[KkMm]"
- */
-static int __init early_mem(char *p)
-{
-	static int usermem __initdata = 1;
-	unsigned long size, start;
-	char *endp;
-
-	/*
-	 * If the user specifies memory size, we
-	 * blow away any automatically generated
-	 * size.
-	 */
-	if (usermem) {
-		usermem = 0;
-		meminfo.nr_banks = 0;
-	}
-
-	start = PHYS_OFFSET;
-	size  = memparse(p, &endp);
-	if (*endp == '@')
-		start = memparse(endp + 1, NULL);
-
-	uc32_add_memory(start, size);
-
-	return 0;
-}
-early_param("mem", early_mem);
-
-static void __init
-request_standard_resources(struct meminfo *mi)
-{
-	struct resource *res;
-	int i;
-
-	kernel_code.start   = virt_to_phys(_stext);
-	kernel_code.end     = virt_to_phys(_etext - 1);
-	kernel_data.start   = virt_to_phys(_sdata);
-	kernel_data.end     = virt_to_phys(_end - 1);
-
-	for (i = 0; i < mi->nr_banks; i++) {
-		if (mi->bank[i].size == 0)
-			continue;
-
-		res = memblock_alloc_low(sizeof(*res), SMP_CACHE_BYTES);
-		if (!res)
-			panic("%s: Failed to allocate %zu bytes align=%x\n",
-			      __func__, sizeof(*res), SMP_CACHE_BYTES);
-
-		res->name  = "System RAM";
-		res->start = mi->bank[i].start;
-		res->end   = mi->bank[i].start + mi->bank[i].size - 1;
-		res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
-
-		request_resource(&iomem_resource, res);
-
-		if (kernel_code.start >= res->start &&
-		    kernel_code.end <= res->end)
-			request_resource(res, &kernel_code);
-		if (kernel_data.start >= res->start &&
-		    kernel_data.end <= res->end)
-			request_resource(res, &kernel_data);
-	}
-}
-
-static void (*init_machine)(void) __initdata;
-
-static int __init customize_machine(void)
-{
-	/* customizes platform devices, or adds new ones */
-	if (init_machine)
-		init_machine();
-	return 0;
-}
-arch_initcall(customize_machine);
-
-void __init setup_arch(char **cmdline_p)
-{
-	char *from = default_command_line;
-
-	setup_processor();
-
-	init_mm.start_code = (unsigned long) _stext;
-	init_mm.end_code   = (unsigned long) _etext;
-	init_mm.end_data   = (unsigned long) _edata;
-	init_mm.brk	   = (unsigned long) _end;
-
-	/* parse_early_param needs a boot_command_line */
-	strlcpy(boot_command_line, from, COMMAND_LINE_SIZE);
-
-	/* populate cmd_line too for later use, preserving boot_command_line */
-	strlcpy(cmd_line, boot_command_line, COMMAND_LINE_SIZE);
-	*cmdline_p = cmd_line;
-
-	parse_early_param();
-
-	uc32_memblock_init(&meminfo);
-
-	paging_init();
-	request_standard_resources(&meminfo);
-
-	cpu_init();
-
-	/*
-	 * Set up various architecture-specific pointers
-	 */
-	init_machine = puv3_core_init;
-
-#ifdef CONFIG_VT
-#if defined(CONFIG_VGA_CONSOLE)
-	conswitchp = &vga_con;
-#endif
-#endif
-	early_trap_init();
-}
-
-static struct cpu cpuinfo_unicore;
-
-static int __init topology_init(void)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		register_cpu(&cpuinfo_unicore, i);
-
-	return 0;
-}
-subsys_initcall(topology_init);
-
-#ifdef CONFIG_HAVE_PROC_CPU
-static int __init proc_cpu_init(void)
-{
-	struct proc_dir_entry *res;
-
-	res = proc_mkdir("cpu", NULL);
-	if (!res)
-		return -ENOMEM;
-	return 0;
-}
-fs_initcall(proc_cpu_init);
-#endif
-
-static int c_show(struct seq_file *m, void *v)
-{
-	seq_printf(m, "Processor\t: UniCore-II rev %d (%s)\n",
-		   (int)(uc32_cpuid >> 16) & 15, elf_platform);
-
-	seq_printf(m, "BogoMIPS\t: %lu.%02lu\n",
-		   loops_per_jiffy / (500000/HZ),
-		   (loops_per_jiffy / (5000/HZ)) % 100);
-
-	/* dump out the processor features */
-	seq_puts(m, "Features\t: CMOV UC-F64");
-
-	seq_printf(m, "\nCPU implementer\t: 0x%02x\n", uc32_cpuid >> 24);
-	seq_printf(m, "CPU architecture: 2\n");
-	seq_printf(m, "CPU revision\t: %d\n", (uc32_cpuid >> 16) & 15);
-
-	seq_printf(m, "Cache type\t: write-back\n"
-			"Cache clean\t: cp0 c5 ops\n"
-			"Cache lockdown\t: not support\n"
-			"Cache format\t: Harvard\n");
-
-	seq_puts(m, "\n");
-
-	seq_printf(m, "Hardware\t: PKUnity v3\n");
-
-	return 0;
-}
-
-static void *c_start(struct seq_file *m, loff_t *pos)
-{
-	return *pos < 1 ? (void *)1 : NULL;
-}
-
-static void *c_next(struct seq_file *m, void *v, loff_t *pos)
-{
-	++*pos;
-	return NULL;
-}
-
-static void c_stop(struct seq_file *m, void *v)
-{
-}
-
-const struct seq_operations cpuinfo_op = {
-	.start	= c_start,
-	.next	= c_next,
-	.stop	= c_stop,
-	.show	= c_show
-};
diff --git a/arch/unicore32/kernel/setup.h b/arch/unicore32/kernel/setup.h
deleted file mode 100644
index 967352323185..000000000000
--- a/arch/unicore32/kernel/setup.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/setup.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#ifndef __UNICORE_KERNEL_SETUP_H__
-#define __UNICORE_KERNEL_SETUP_H__
-
-#include <asm/hwdef-copro.h>
-
-extern void paging_init(void);
-extern void puv3_core_init(void);
-extern void cpu_init(void);
-
-extern void puv3_ps2_init(void);
-extern void pci_puv3_preinit(void);
-extern void __init puv3_init_gpio(void);
-
-extern void setup_mm_for_reboot(void);
-
-extern char __stubs_start[], __stubs_end[];
-extern char __vectors_start[], __vectors_end[];
-
-extern void kernel_thread_helper(void);
-
-extern void __init early_signal_init(void);
-
-extern asmlinkage void __backtrace(void);
-extern asmlinkage void c_backtrace(unsigned long fp, const char *loglvl);
-
-extern void __show_regs(struct pt_regs *);
-
-#endif
diff --git a/arch/unicore32/kernel/signal.c b/arch/unicore32/kernel/signal.c
deleted file mode 100644
index 3946182a835d..000000000000
--- a/arch/unicore32/kernel/signal.c
+++ /dev/null
@@ -1,424 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/signal.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/personality.h>
-#include <linux/uaccess.h>
-#include <linux/tracehook.h>
-#include <linux/elf.h>
-#include <linux/unistd.h>
-
-#include <asm/cacheflush.h>
-#include <asm/ucontext.h>
-
-/*
- * For UniCore syscalls, we encode the syscall number into the instruction.
- */
-#define SWI_SYS_SIGRETURN	(0xff000000) /* error number for new abi */
-#define SWI_SYS_RT_SIGRETURN	(0xff000000 | (__NR_rt_sigreturn))
-#define SWI_SYS_RESTART		(0xff000000 | (__NR_restart_syscall))
-
-#define KERN_SIGRETURN_CODE	(KUSER_VECPAGE_BASE + 0x00000500)
-#define KERN_RESTART_CODE	(KERN_SIGRETURN_CODE + sizeof(sigreturn_codes))
-
-const unsigned long sigreturn_codes[3] = {
-	SWI_SYS_SIGRETURN, SWI_SYS_RT_SIGRETURN,
-};
-
-const unsigned long syscall_restart_code[2] = {
-	SWI_SYS_RESTART,	/* swi	__NR_restart_syscall */
-	0x69efc004,		/* ldr	pc, [sp], #4 */
-};
-
-/*
- * Do a signal return; undo the signal stack.  These are aligned to 64-bit.
- */
-struct sigframe {
-	struct ucontext uc;
-	unsigned long retcode[2];
-};
-
-struct rt_sigframe {
-	struct siginfo info;
-	struct sigframe sig;
-};
-
-static int restore_sigframe(struct pt_regs *regs, struct sigframe __user *sf)
-{
-	sigset_t set;
-	int err;
-
-	err = __copy_from_user(&set, &sf->uc.uc_sigmask, sizeof(set));
-	if (err == 0)
-		set_current_blocked(&set);
-
-	err |= __get_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00);
-	err |= __get_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01);
-	err |= __get_user(regs->UCreg_02, &sf->uc.uc_mcontext.regs.UCreg_02);
-	err |= __get_user(regs->UCreg_03, &sf->uc.uc_mcontext.regs.UCreg_03);
-	err |= __get_user(regs->UCreg_04, &sf->uc.uc_mcontext.regs.UCreg_04);
-	err |= __get_user(regs->UCreg_05, &sf->uc.uc_mcontext.regs.UCreg_05);
-	err |= __get_user(regs->UCreg_06, &sf->uc.uc_mcontext.regs.UCreg_06);
-	err |= __get_user(regs->UCreg_07, &sf->uc.uc_mcontext.regs.UCreg_07);
-	err |= __get_user(regs->UCreg_08, &sf->uc.uc_mcontext.regs.UCreg_08);
-	err |= __get_user(regs->UCreg_09, &sf->uc.uc_mcontext.regs.UCreg_09);
-	err |= __get_user(regs->UCreg_10, &sf->uc.uc_mcontext.regs.UCreg_10);
-	err |= __get_user(regs->UCreg_11, &sf->uc.uc_mcontext.regs.UCreg_11);
-	err |= __get_user(regs->UCreg_12, &sf->uc.uc_mcontext.regs.UCreg_12);
-	err |= __get_user(regs->UCreg_13, &sf->uc.uc_mcontext.regs.UCreg_13);
-	err |= __get_user(regs->UCreg_14, &sf->uc.uc_mcontext.regs.UCreg_14);
-	err |= __get_user(regs->UCreg_15, &sf->uc.uc_mcontext.regs.UCreg_15);
-	err |= __get_user(regs->UCreg_16, &sf->uc.uc_mcontext.regs.UCreg_16);
-	err |= __get_user(regs->UCreg_17, &sf->uc.uc_mcontext.regs.UCreg_17);
-	err |= __get_user(regs->UCreg_18, &sf->uc.uc_mcontext.regs.UCreg_18);
-	err |= __get_user(regs->UCreg_19, &sf->uc.uc_mcontext.regs.UCreg_19);
-	err |= __get_user(regs->UCreg_20, &sf->uc.uc_mcontext.regs.UCreg_20);
-	err |= __get_user(regs->UCreg_21, &sf->uc.uc_mcontext.regs.UCreg_21);
-	err |= __get_user(regs->UCreg_22, &sf->uc.uc_mcontext.regs.UCreg_22);
-	err |= __get_user(regs->UCreg_23, &sf->uc.uc_mcontext.regs.UCreg_23);
-	err |= __get_user(regs->UCreg_24, &sf->uc.uc_mcontext.regs.UCreg_24);
-	err |= __get_user(regs->UCreg_25, &sf->uc.uc_mcontext.regs.UCreg_25);
-	err |= __get_user(regs->UCreg_26, &sf->uc.uc_mcontext.regs.UCreg_26);
-	err |= __get_user(regs->UCreg_fp, &sf->uc.uc_mcontext.regs.UCreg_fp);
-	err |= __get_user(regs->UCreg_ip, &sf->uc.uc_mcontext.regs.UCreg_ip);
-	err |= __get_user(regs->UCreg_sp, &sf->uc.uc_mcontext.regs.UCreg_sp);
-	err |= __get_user(regs->UCreg_lr, &sf->uc.uc_mcontext.regs.UCreg_lr);
-	err |= __get_user(regs->UCreg_pc, &sf->uc.uc_mcontext.regs.UCreg_pc);
-	err |= __get_user(regs->UCreg_asr, &sf->uc.uc_mcontext.regs.UCreg_asr);
-
-	err |= !valid_user_regs(regs);
-
-	return err;
-}
-
-asmlinkage int __sys_rt_sigreturn(struct pt_regs *regs)
-{
-	struct rt_sigframe __user *frame;
-
-	/* Always make any pending restarted system calls return -EINTR */
-	current->restart_block.fn = do_no_restart_syscall;
-
-	/*
-	 * Since we stacked the signal on a 64-bit boundary,
-	 * then 'sp' should be word aligned here.  If it's
-	 * not, then the user is trying to mess with us.
-	 */
-	if (regs->UCreg_sp & 7)
-		goto badframe;
-
-	frame = (struct rt_sigframe __user *)regs->UCreg_sp;
-
-	if (!access_ok(frame, sizeof(*frame)))
-		goto badframe;
-
-	if (restore_sigframe(regs, &frame->sig))
-		goto badframe;
-
-	if (restore_altstack(&frame->sig.uc.uc_stack))
-		goto badframe;
-
-	return regs->UCreg_00;
-
-badframe:
-	force_sig(SIGSEGV);
-	return 0;
-}
-
-static int setup_sigframe(struct sigframe __user *sf, struct pt_regs *regs,
-		sigset_t *set)
-{
-	int err = 0;
-
-	err |= __put_user(regs->UCreg_00, &sf->uc.uc_mcontext.regs.UCreg_00);
-	err |= __put_user(regs->UCreg_01, &sf->uc.uc_mcontext.regs.UCreg_01);
-	err |= __put_user(regs->UCreg_02, &sf->uc.uc_mcontext.regs.UCreg_02);
-	err |= __put_user(regs->UCreg_03, &sf->uc.uc_mcontext.regs.UCreg_03);
-	err |= __put_user(regs->UCreg_04, &sf->uc.uc_mcontext.regs.UCreg_04);
-	err |= __put_user(regs->UCreg_05, &sf->uc.uc_mcontext.regs.UCreg_05);
-	err |= __put_user(regs->UCreg_06, &sf->uc.uc_mcontext.regs.UCreg_06);
-	err |= __put_user(regs->UCreg_07, &sf->uc.uc_mcontext.regs.UCreg_07);
-	err |= __put_user(regs->UCreg_08, &sf->uc.uc_mcontext.regs.UCreg_08);
-	err |= __put_user(regs->UCreg_09, &sf->uc.uc_mcontext.regs.UCreg_09);
-	err |= __put_user(regs->UCreg_10, &sf->uc.uc_mcontext.regs.UCreg_10);
-	err |= __put_user(regs->UCreg_11, &sf->uc.uc_mcontext.regs.UCreg_11);
-	err |= __put_user(regs->UCreg_12, &sf->uc.uc_mcontext.regs.UCreg_12);
-	err |= __put_user(regs->UCreg_13, &sf->uc.uc_mcontext.regs.UCreg_13);
-	err |= __put_user(regs->UCreg_14, &sf->uc.uc_mcontext.regs.UCreg_14);
-	err |= __put_user(regs->UCreg_15, &sf->uc.uc_mcontext.regs.UCreg_15);
-	err |= __put_user(regs->UCreg_16, &sf->uc.uc_mcontext.regs.UCreg_16);
-	err |= __put_user(regs->UCreg_17, &sf->uc.uc_mcontext.regs.UCreg_17);
-	err |= __put_user(regs->UCreg_18, &sf->uc.uc_mcontext.regs.UCreg_18);
-	err |= __put_user(regs->UCreg_19, &sf->uc.uc_mcontext.regs.UCreg_19);
-	err |= __put_user(regs->UCreg_20, &sf->uc.uc_mcontext.regs.UCreg_20);
-	err |= __put_user(regs->UCreg_21, &sf->uc.uc_mcontext.regs.UCreg_21);
-	err |= __put_user(regs->UCreg_22, &sf->uc.uc_mcontext.regs.UCreg_22);
-	err |= __put_user(regs->UCreg_23, &sf->uc.uc_mcontext.regs.UCreg_23);
-	err |= __put_user(regs->UCreg_24, &sf->uc.uc_mcontext.regs.UCreg_24);
-	err |= __put_user(regs->UCreg_25, &sf->uc.uc_mcontext.regs.UCreg_25);
-	err |= __put_user(regs->UCreg_26, &sf->uc.uc_mcontext.regs.UCreg_26);
-	err |= __put_user(regs->UCreg_fp, &sf->uc.uc_mcontext.regs.UCreg_fp);
-	err |= __put_user(regs->UCreg_ip, &sf->uc.uc_mcontext.regs.UCreg_ip);
-	err |= __put_user(regs->UCreg_sp, &sf->uc.uc_mcontext.regs.UCreg_sp);
-	err |= __put_user(regs->UCreg_lr, &sf->uc.uc_mcontext.regs.UCreg_lr);
-	err |= __put_user(regs->UCreg_pc, &sf->uc.uc_mcontext.regs.UCreg_pc);
-	err |= __put_user(regs->UCreg_asr, &sf->uc.uc_mcontext.regs.UCreg_asr);
-
-	err |= __put_user(current->thread.trap_no,
-			&sf->uc.uc_mcontext.trap_no);
-	err |= __put_user(current->thread.error_code,
-			&sf->uc.uc_mcontext.error_code);
-	err |= __put_user(current->thread.address,
-			&sf->uc.uc_mcontext.fault_address);
-	err |= __put_user(set->sig[0], &sf->uc.uc_mcontext.oldmask);
-
-	err |= __copy_to_user(&sf->uc.uc_sigmask, set, sizeof(*set));
-
-	return err;
-}
-
-static inline void __user *get_sigframe(struct k_sigaction *ka,
-		struct pt_regs *regs, int framesize)
-{
-	unsigned long sp = regs->UCreg_sp;
-	void __user *frame;
-
-	/*
-	 * This is the X/Open sanctioned signal stack switching.
-	 */
-	if ((ka->sa.sa_flags & SA_ONSTACK) && !sas_ss_flags(sp))
-		sp = current->sas_ss_sp + current->sas_ss_size;
-
-	/*
-	 * ATPCS B01 mandates 8-byte alignment
-	 */
-	frame = (void __user *)((sp - framesize) & ~7);
-
-	/*
-	 * Check that we can actually write to the signal frame.
-	 */
-	if (!access_ok(frame, framesize))
-		frame = NULL;
-
-	return frame;
-}
-
-static int setup_return(struct pt_regs *regs, struct k_sigaction *ka,
-	     unsigned long __user *rc, void __user *frame, int usig)
-{
-	unsigned long handler = (unsigned long)ka->sa.sa_handler;
-	unsigned long retcode;
-	unsigned long asr = regs->UCreg_asr & ~PSR_f;
-
-	unsigned int idx = 0;
-
-	if (ka->sa.sa_flags & SA_SIGINFO)
-		idx += 1;
-
-	if (__put_user(sigreturn_codes[idx],   rc) ||
-	    __put_user(sigreturn_codes[idx+1], rc+1))
-		return 1;
-
-	retcode = KERN_SIGRETURN_CODE + (idx << 2);
-
-	regs->UCreg_00 = usig;
-	regs->UCreg_sp = (unsigned long)frame;
-	regs->UCreg_lr = retcode;
-	regs->UCreg_pc = handler;
-	regs->UCreg_asr = asr;
-
-	return 0;
-}
-
-static int setup_frame(struct ksignal *ksig, sigset_t *set,
-		       struct pt_regs *regs)
-{
-	struct sigframe __user *frame = get_sigframe(&ksig->ka, regs, sizeof(*frame));
-	int err = 0;
-
-	if (!frame)
-		return 1;
-
-	/*
-	 * Set uc.uc_flags to a value which sc.trap_no would never have.
-	 */
-	err |= __put_user(0x5ac3c35a, &frame->uc.uc_flags);
-
-	err |= setup_sigframe(frame, regs, set);
-	if (err == 0)
-		err |= setup_return(regs, &ksig->ka, frame->retcode, frame,
-				    ksig->sig);
-
-	return err;
-}
-
-static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
-			  struct pt_regs *regs)
-{
-	struct rt_sigframe __user *frame =
-			get_sigframe(&ksig->ka, regs, sizeof(*frame));
-	int err = 0;
-
-	if (!frame)
-		return 1;
-
-	err |= copy_siginfo_to_user(&frame->info, &ksig->info);
-
-	err |= __put_user(0, &frame->sig.uc.uc_flags);
-	err |= __put_user(NULL, &frame->sig.uc.uc_link);
-	err |= __save_altstack(&frame->sig.uc.uc_stack, regs->UCreg_sp);
-	err |= setup_sigframe(&frame->sig, regs, set);
-	if (err == 0)
-		err |= setup_return(regs, &ksig->ka, frame->sig.retcode, frame,
-				    ksig->sig);
-
-	if (err == 0) {
-		/*
-		 * For realtime signals we must also set the second and third
-		 * arguments for the signal handler.
-		 */
-		regs->UCreg_01 = (unsigned long)&frame->info;
-		regs->UCreg_02 = (unsigned long)&frame->sig.uc;
-	}
-
-	return err;
-}
-
-static inline void setup_syscall_restart(struct pt_regs *regs)
-{
-	regs->UCreg_00 = regs->UCreg_ORIG_00;
-	regs->UCreg_pc -= 4;
-}
-
-/*
- * OK, we're invoking a handler
- */
-static void handle_signal(struct ksignal *ksig, struct pt_regs *regs,
-			  int syscall)
-{
-	struct thread_info *thread = current_thread_info();
-	sigset_t *oldset = sigmask_to_save();
-	int usig = ksig->sig;
-	int ret;
-
-	/*
-	 * If we were from a system call, check for system call restarting...
-	 */
-	if (syscall) {
-		switch (regs->UCreg_00) {
-		case -ERESTART_RESTARTBLOCK:
-		case -ERESTARTNOHAND:
-			regs->UCreg_00 = -EINTR;
-			break;
-		case -ERESTARTSYS:
-			if (!(ksig->ka.sa.sa_flags & SA_RESTART)) {
-				regs->UCreg_00 = -EINTR;
-				break;
-			}
-			/* fallthrough */
-		case -ERESTARTNOINTR:
-			setup_syscall_restart(regs);
-		}
-	}
-
-	/*
-	 * Set up the stack frame
-	 */
-	if (ksig->ka.sa.sa_flags & SA_SIGINFO)
-		ret = setup_rt_frame(ksig, oldset, regs);
-	else
-		ret = setup_frame(ksig, oldset, regs);
-
-	/*
-	 * Check that the resulting registers are actually sane.
-	 */
-	ret |= !valid_user_regs(regs);
-
-	signal_setup_done(ret, ksig, 0);
-}
-
-/*
- * Note that 'init' is a special process: it doesn't get signals it doesn't
- * want to handle. Thus you cannot kill init even with a SIGKILL even by
- * mistake.
- *
- * Note that we go through the signals twice: once to check the signals that
- * the kernel can handle, and then we build all the user-level signal handling
- * stack-frames in one go after that.
- */
-static void do_signal(struct pt_regs *regs, int syscall)
-{
-	struct ksignal ksig;
-
-	/*
-	 * We want the common case to go fast, which
-	 * is why we may in certain cases get here from
-	 * kernel mode. Just return without doing anything
-	 * if so.
-	 */
-	if (!user_mode(regs))
-		return;
-
-	if (get_signal(&ksig)) {
-		handle_signal(&ksig, regs, syscall);
-		return;
-	}
-
-	/*
-	 * No signal to deliver to the process - restart the syscall.
-	 */
-	if (syscall) {
-		if (regs->UCreg_00 == -ERESTART_RESTARTBLOCK) {
-				u32 __user *usp;
-
-				regs->UCreg_sp -= 4;
-				usp = (u32 __user *)regs->UCreg_sp;
-
-				if (put_user(regs->UCreg_pc, usp) == 0) {
-					regs->UCreg_pc = KERN_RESTART_CODE;
-				} else {
-					regs->UCreg_sp += 4;
-					force_sigsegv(0);
-				}
-		}
-		if (regs->UCreg_00 == -ERESTARTNOHAND ||
-		    regs->UCreg_00 == -ERESTARTSYS ||
-		    regs->UCreg_00 == -ERESTARTNOINTR) {
-			setup_syscall_restart(regs);
-		}
-	}
-	/* If there's no signal to deliver, we just put the saved
-	 * sigmask back.
-	 */
-	restore_saved_sigmask();
-}
-
-asmlinkage void do_notify_resume(struct pt_regs *regs,
-		unsigned int thread_flags, int syscall)
-{
-	if (thread_flags & _TIF_SIGPENDING)
-		do_signal(regs, syscall);
-
-	if (thread_flags & _TIF_NOTIFY_RESUME) {
-		clear_thread_flag(TIF_NOTIFY_RESUME);
-		tracehook_notify_resume(regs);
-	}
-}
-
-/*
- * Copy signal return handlers into the vector page, and
- * set sigreturn to be a pointer to these.
- */
-void __init early_signal_init(void)
-{
-	memcpy((void *)kuser_vecpage_to_vectors(KERN_SIGRETURN_CODE),
-			sigreturn_codes, sizeof(sigreturn_codes));
-	memcpy((void *)kuser_vecpage_to_vectors(KERN_RESTART_CODE),
-			syscall_restart_code, sizeof(syscall_restart_code));
-	/* Need not to flush icache, since early_trap_init will do it last. */
-}
diff --git a/arch/unicore32/kernel/sleep.S b/arch/unicore32/kernel/sleep.S
deleted file mode 100644
index 23151abe53c6..000000000000
--- a/arch/unicore32/kernel/sleep.S
+++ /dev/null
@@ -1,199 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/sleep.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <mach/hardware.h>
-
-		.text
-
-pkunity_cpu_save_cp:
-
-	@ get coprocessor registers
-
-	movc	r3, p0.c7, #0			@ PID
-	movc	r4, p0.c2, #0			@ translation table base addr
-	movc	r5, p0.c1, #0			@ control reg
-
-
-	@ store them plus current virtual stack ptr on stack
-	mov	r6, sp
-	stm.w	(r3 - r6), [sp-]
-
-	mov	pc, lr
-
-pkunity_cpu_save_sp:
-	@ preserve phys address of stack
-	mov	r0, sp
-	stw.w	lr, [sp+], #-4
-	b.l	sleep_phys_sp
-	ldw	r1, =sleep_save_sp
-	stw	r0, [r1]
-	ldw.w	pc, [sp]+, #4
-
-/*
- * puv3_cpu_suspend()
- *
- * Forces CPU into sleep state.
- *
- * r0 = value for PWRMODE M field for desired sleep state
- */
-
-ENTRY(puv3_cpu_suspend)
-	stm.w	(r16 - r27, lr), [sp-]		@ save registers on stack
-	stm.w	(r4 - r15), [sp-]		@ save registers on stack
-
-#ifdef	CONFIG_UNICORE_FPU_F64
-	sfm.w	(f0  - f7 ), [sp-]
-	sfm.w	(f8  - f15), [sp-]
-	sfm.w	(f16 - f23), [sp-]
-	sfm.w	(f24 - f31), [sp-]
-	cff	r4, s31
-	stm.w	(r4), [sp-]
-#endif
-	b.l	pkunity_cpu_save_cp
-
-	b.l	pkunity_cpu_save_sp
-
-	@ clean data cache
-	mov	r1, #0
-	movc	p0.c5, r1, #14
-	nop
-	nop
-	nop
-	nop
-
-
-
-	@ DDR2 BaseAddr
-	ldw	r0, =(PKUNITY_DDR2CTRL_BASE)
-
-	@ PM BaseAddr
-	ldw	r1, =(PKUNITY_PM_BASE)
-
-	@ set PLL_SYS_CFG reg, 275
-	movl	r6, #0x00002401
-	stw	r6, [r1+], #0x18
-	@ set PLL_DDR_CFG reg, 66MHz
-	movl	r6, #0x00100c00
-	stw	r6, [r1+], #0x1c
-
-	@ set wake up source
-	movl	r8, #0x800001ff		@ epip4d
-	stw	r8, [r1+], #0xc
-
-	@ set PGSR
-	movl	r5, #0x40000
-	stw	r5, [r1+], #0x10
-
-	@ prepare DDR2 refresh settings
-	ldw	r5, [r0+], #0x24
-	or	r5, r5, #0x00000001
-
-	@ prepare PMCR for PLL changing
-	movl	r6, #0xc
-
-	@ prepare for closing PLL
-	movl	r7, #0x1
-
-	@ prepare sleep mode
-	mov	r8, #0x1
-
-@	movl	r0, 0x11111111
-@	put_word_ocd r0
-	b	pkunity_cpu_do_suspend
-
-	.ltorg
-	.align	5
-pkunity_cpu_do_suspend:
-	b	101f
-	@ put DDR2 into self-refresh
-100:	stw	r5, [r0+], #0x24
-	@ change PLL
-	stw	r6, [r1]
-	b	1f
-
-	.ltorg
-	.align	5
-101:	b	102f
-	@ wait for PLL changing complete
-1:	ldw	r6, [r1+], #0x44
-	csub.a	r6, #0x1
-	bne	1b
-	b	2f
-
-	.ltorg
-	.align	5
-102:	b	100b
-	@ close PLL
-2:	stw	r7, [r1+], #0x4
-	@ enter sleep mode
-	stw	r8, [r1]
-3:	b	3b
-
-
-
-
-/*
- * puv3_cpu_resume()
- *
- * entry point from bootloader into kernel during resume
- *
- * Note: Yes, part of the following code is located into the .data section.
- *       This is to allow sleep_save_sp to be accessed with a relative load
- *       while we can't rely on any MMU translation.  We could have put
- *       sleep_save_sp in the .text section as well, but some setups might
- *       insist on it to be truly read-only.
- */
-
-	.data
-	.align 5
-ENTRY(puv3_cpu_resume)
-@	movl	r0, 0x20202020
-@	put_word_ocd r0
-
-	ldw	r0, sleep_save_sp		@ stack phys addr
-	ldw	r2, =resume_after_mmu		@ its absolute virtual address
-	ldm	(r3 - r6), [r0]+		@ CP regs + virt stack ptr
-	mov	sp, r6				@ CP regs + virt stack ptr
-
-	mov	r1, #0
-	movc	p0.c6, r1, #6			@ invalidate I & D TLBs
-	movc	p0.c5, r1, #28			@ invalidate I & D caches, BTB
-
-	movc	p0.c7, r3, #0			@ PID
-	movc	p0.c2, r4, #0			@ translation table base addr
-	movc	p0.c1, r5, #0			@ control reg, turn on mmu
-	nop
-	jump	r2
-	nop
-	nop
-	nop
-	nop
-	nop
-
-sleep_save_sp:
-	.word	0				@ preserve stack phys ptr here
-
-	.text
-resume_after_mmu:
-@	movl	r0, 0x30303030
-@	put_word_ocd r0
-
-#ifdef	CONFIG_UNICORE_FPU_F64
-	lfm.w	(f0  - f7 ), [sp]+
-	lfm.w	(f8  - f15), [sp]+
-	lfm.w	(f16 - f23), [sp]+
-	lfm.w	(f24 - f31), [sp]+
-	ldm.w	(r4), [sp]+
-	ctf	r4, s31
-#endif
-	ldm.w	(r4 - r15), [sp]+		@ restore registers from stack
-	ldm.w	(r16 - r27, pc), [sp]+		@ return to caller
diff --git a/arch/unicore32/kernel/stacktrace.c b/arch/unicore32/kernel/stacktrace.c
deleted file mode 100644
index c9d8650e9d78..000000000000
--- a/arch/unicore32/kernel/stacktrace.c
+++ /dev/null
@@ -1,127 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/stacktrace.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/sched/debug.h>
-#include <linux/stacktrace.h>
-
-#include <asm/stacktrace.h>
-
-#if defined(CONFIG_FRAME_POINTER)
-/*
- * Unwind the current stack frame and store the new register values in the
- * structure passed as argument. Unwinding is equivalent to a function return,
- * hence the new PC value rather than LR should be used for backtrace.
- *
- * With framepointer enabled, a simple function prologue looks like this:
- *	mov	ip, sp
- *	stmdb	sp!, {fp, ip, lr, pc}
- *	sub	fp, ip, #4
- *
- * A simple function epilogue looks like this:
- *	ldm	sp, {fp, sp, pc}
- *
- * Note that with framepointer enabled, even the leaf functions have the same
- * prologue and epilogue, therefore we can ignore the LR value in this case.
- */
-int notrace unwind_frame(struct stackframe *frame)
-{
-	unsigned long high, low;
-	unsigned long fp = frame->fp;
-
-	/* only go to a higher address on the stack */
-	low = frame->sp;
-	high = ALIGN(low, THREAD_SIZE);
-
-	/* check current frame pointer is within bounds */
-	if (fp < (low + 12) || fp + 4 >= high)
-		return -EINVAL;
-
-	/* restore the registers from the stack frame */
-	frame->fp = *(unsigned long *)(fp - 12);
-	frame->sp = *(unsigned long *)(fp - 8);
-	frame->pc = *(unsigned long *)(fp - 4);
-
-	return 0;
-}
-#endif
-
-void notrace walk_stackframe(struct stackframe *frame,
-		     int (*fn)(struct stackframe *, void *), void *data)
-{
-	while (1) {
-		int ret;
-
-		if (fn(frame, data))
-			break;
-		ret = unwind_frame(frame);
-		if (ret < 0)
-			break;
-	}
-}
-EXPORT_SYMBOL(walk_stackframe);
-
-#ifdef CONFIG_STACKTRACE
-struct stack_trace_data {
-	struct stack_trace *trace;
-	unsigned int no_sched_functions;
-	unsigned int skip;
-};
-
-static int save_trace(struct stackframe *frame, void *d)
-{
-	struct stack_trace_data *data = d;
-	struct stack_trace *trace = data->trace;
-	unsigned long addr = frame->pc;
-
-	if (data->no_sched_functions && in_sched_functions(addr))
-		return 0;
-	if (data->skip) {
-		data->skip--;
-		return 0;
-	}
-
-	trace->entries[trace->nr_entries++] = addr;
-
-	return trace->nr_entries >= trace->max_entries;
-}
-
-void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
-{
-	struct stack_trace_data data;
-	struct stackframe frame;
-
-	data.trace = trace;
-	data.skip = trace->skip;
-
-	if (tsk != current) {
-		data.no_sched_functions = 1;
-		frame.fp = thread_saved_fp(tsk);
-		frame.sp = thread_saved_sp(tsk);
-		frame.lr = 0;		/* recovered from the stack */
-		frame.pc = thread_saved_pc(tsk);
-	} else {
-		register unsigned long current_sp asm("sp");
-
-		data.no_sched_functions = 0;
-		frame.fp = (unsigned long)__builtin_frame_address(0);
-		frame.sp = current_sp;
-		frame.lr = (unsigned long)__builtin_return_address(0);
-		frame.pc = (unsigned long)save_stack_trace_tsk;
-	}
-
-	walk_stackframe(&frame, save_trace, &data);
-}
-
-void save_stack_trace(struct stack_trace *trace)
-{
-	save_stack_trace_tsk(current, trace);
-}
-EXPORT_SYMBOL_GPL(save_stack_trace);
-#endif
diff --git a/arch/unicore32/kernel/sys.c b/arch/unicore32/kernel/sys.c
deleted file mode 100644
index 256fb4082296..000000000000
--- a/arch/unicore32/kernel/sys.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/sys.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
-#include <linux/shm.h>
-#include <linux/stat.h>
-#include <linux/syscalls.h>
-#include <linux/mman.h>
-#include <linux/fs.h>
-#include <linux/file.h>
-#include <linux/ipc.h>
-#include <linux/uaccess.h>
-
-#include <asm/syscalls.h>
-#include <asm/cacheflush.h>
-
-/* Provide the actual syscall number to call mapping. */
-#undef __SYSCALL
-#define __SYSCALL(nr, call)	[nr] = (call),
-
-#define sys_mmap2 sys_mmap_pgoff
-/* Note that we don't include <linux/unistd.h> but <asm/unistd.h> */
-void *sys_call_table[__NR_syscalls] = {
-	[0 ... __NR_syscalls-1] = sys_ni_syscall,
-#include <asm/unistd.h>
-};
diff --git a/arch/unicore32/kernel/time.c b/arch/unicore32/kernel/time.c
deleted file mode 100644
index c3a37edf4d40..000000000000
--- a/arch/unicore32/kernel/time.c
+++ /dev/null
@@ -1,128 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/time.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/timex.h>
-#include <linux/clockchips.h>
-
-#include <mach/hardware.h>
-
-#define MIN_OSCR_DELTA 2
-
-static irqreturn_t puv3_ost0_interrupt(int irq, void *dev_id)
-{
-	struct clock_event_device *c = dev_id;
-
-	/* Disarm the compare/match, signal the event. */
-	writel(readl(OST_OIER) & ~OST_OIER_E0, OST_OIER);
-	writel(readl(OST_OSSR) & ~OST_OSSR_M0, OST_OSSR);
-	c->event_handler(c);
-
-	return IRQ_HANDLED;
-}
-
-static int
-puv3_osmr0_set_next_event(unsigned long delta, struct clock_event_device *c)
-{
-	unsigned long next, oscr;
-
-	writel(readl(OST_OIER) | OST_OIER_E0, OST_OIER);
-	next = readl(OST_OSCR) + delta;
-	writel(next, OST_OSMR0);
-	oscr = readl(OST_OSCR);
-
-	return (signed)(next - oscr) <= MIN_OSCR_DELTA ? -ETIME : 0;
-}
-
-static int puv3_osmr0_shutdown(struct clock_event_device *evt)
-{
-	writel(readl(OST_OIER) & ~OST_OIER_E0, OST_OIER);
-	writel(readl(OST_OSSR) & ~OST_OSSR_M0, OST_OSSR);
-	return 0;
-}
-
-static struct clock_event_device ckevt_puv3_osmr0 = {
-	.name			= "osmr0",
-	.features		= CLOCK_EVT_FEAT_ONESHOT,
-	.rating			= 200,
-	.set_next_event		= puv3_osmr0_set_next_event,
-	.set_state_shutdown	= puv3_osmr0_shutdown,
-	.set_state_oneshot	= puv3_osmr0_shutdown,
-};
-
-static u64 puv3_read_oscr(struct clocksource *cs)
-{
-	return readl(OST_OSCR);
-}
-
-static struct clocksource cksrc_puv3_oscr = {
-	.name		= "oscr",
-	.rating		= 200,
-	.read		= puv3_read_oscr,
-	.mask		= CLOCKSOURCE_MASK(32),
-	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,
-};
-
-void __init time_init(void)
-{
-	writel(0, OST_OIER);		/* disable any timer interrupts */
-	writel(0, OST_OSSR);		/* clear status on all timers */
-
-	clockevents_calc_mult_shift(&ckevt_puv3_osmr0, CLOCK_TICK_RATE, 5);
-
-	ckevt_puv3_osmr0.max_delta_ns =
-		clockevent_delta2ns(0x7fffffff, &ckevt_puv3_osmr0);
-	ckevt_puv3_osmr0.max_delta_ticks = 0x7fffffff;
-	ckevt_puv3_osmr0.min_delta_ns =
-		clockevent_delta2ns(MIN_OSCR_DELTA * 2, &ckevt_puv3_osmr0) + 1;
-	ckevt_puv3_osmr0.min_delta_ticks = MIN_OSCR_DELTA * 2;
-	ckevt_puv3_osmr0.cpumask = cpumask_of(0);
-
-	if (request_irq(IRQ_TIMER0, puv3_ost0_interrupt,
-			IRQF_TIMER | IRQF_IRQPOLL, "ost0", &ckevt_puv3_osmr0))
-		pr_err("Failed to register ost0 interrupt\n");
-
-	clocksource_register_hz(&cksrc_puv3_oscr, CLOCK_TICK_RATE);
-	clockevents_register_device(&ckevt_puv3_osmr0);
-}
-
-#ifdef CONFIG_PM
-unsigned long osmr[4], oier;
-
-void puv3_timer_suspend(void)
-{
-	osmr[0] = readl(OST_OSMR0);
-	osmr[1] = readl(OST_OSMR1);
-	osmr[2] = readl(OST_OSMR2);
-	osmr[3] = readl(OST_OSMR3);
-	oier = readl(OST_OIER);
-}
-
-void puv3_timer_resume(void)
-{
-	writel(0, OST_OSSR);
-	writel(osmr[0], OST_OSMR0);
-	writel(osmr[1], OST_OSMR1);
-	writel(osmr[2], OST_OSMR2);
-	writel(osmr[3], OST_OSMR3);
-	writel(oier, OST_OIER);
-
-	/*
-	 * OSMR0 is the system timer: make sure OSCR is sufficiently behind
-	 */
-	writel(readl(OST_OSMR0) - LATCH, OST_OSCR);
-}
-#else
-void puv3_timer_suspend(void) { };
-void puv3_timer_resume(void) { };
-#endif
-
diff --git a/arch/unicore32/kernel/traps.c b/arch/unicore32/kernel/traps.c
deleted file mode 100644
index a3ac01df1a2e..000000000000
--- a/arch/unicore32/kernel/traps.c
+++ /dev/null
@@ -1,322 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/traps.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  'traps.c' handles hardware exceptions after we have saved some state.
- *  Mostly a debugging aid, but will probably kill the offending process.
- */
-#include <linux/module.h>
-#include <linux/signal.h>
-#include <linux/sched/signal.h>
-#include <linux/sched/debug.h>
-#include <linux/sched/task_stack.h>
-#include <linux/spinlock.h>
-#include <linux/personality.h>
-#include <linux/kallsyms.h>
-#include <linux/kdebug.h>
-#include <linux/uaccess.h>
-#include <linux/delay.h>
-#include <linux/hardirq.h>
-#include <linux/init.h>
-#include <linux/atomic.h>
-#include <linux/unistd.h>
-
-#include <asm/cacheflush.h>
-#include <asm/traps.h>
-
-#include "setup.h"
-
-static void dump_mem(const char *, const char *, unsigned long, unsigned long);
-
-void dump_backtrace_entry(unsigned long where,
-		unsigned long from, unsigned long frame)
-{
-#ifdef CONFIG_KALLSYMS
-	printk(KERN_DEFAULT "[<%08lx>] (%pS) from [<%08lx>] (%pS)\n",
-			where, (void *)where, from, (void *)from);
-#else
-	printk(KERN_DEFAULT "Function entered at [<%08lx>] from [<%08lx>]\n",
-			where, from);
-#endif
-}
-
-/*
- * Stack pointers should always be within the kernels view of
- * physical memory.  If it is not there, then we can't dump
- * out any information relating to the stack.
- */
-static int verify_stack(unsigned long sp)
-{
-	if (sp < PAGE_OFFSET ||
-	    (sp > (unsigned long)high_memory && high_memory != NULL))
-		return -EFAULT;
-
-	return 0;
-}
-
-/*
- * Dump out the contents of some memory nicely...
- */
-static void dump_mem(const char *lvl, const char *str, unsigned long bottom,
-		     unsigned long top)
-{
-	unsigned long first;
-	mm_segment_t fs;
-	int i;
-
-	/*
-	 * We need to switch to kernel mode so that we can use __get_user
-	 * to safely read from kernel space.  Note that we now dump the
-	 * code first, just in case the backtrace kills us.
-	 */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-
-	printk(KERN_DEFAULT "%s%s(0x%08lx to 0x%08lx)\n",
-			lvl, str, bottom, top);
-
-	for (first = bottom & ~31; first < top; first += 32) {
-		unsigned long p;
-		char str[sizeof(" 12345678") * 8 + 1];
-
-		memset(str, ' ', sizeof(str));
-		str[sizeof(str) - 1] = '\0';
-
-		for (p = first, i = 0; i < 8 && p < top; i++, p += 4) {
-			if (p >= bottom && p < top) {
-				unsigned long val;
-				if (__get_user(val, (unsigned long *)p) == 0)
-					sprintf(str + i * 9, " %08lx", val);
-				else
-					sprintf(str + i * 9, " ????????");
-			}
-		}
-		printk(KERN_DEFAULT "%s%04lx:%s\n", lvl, first & 0xffff, str);
-	}
-
-	set_fs(fs);
-}
-
-static void dump_instr(const char *lvl, struct pt_regs *regs)
-{
-	unsigned long addr = instruction_pointer(regs);
-	const int width = 8;
-	mm_segment_t fs;
-	char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str;
-	int i;
-
-	/*
-	 * We need to switch to kernel mode so that we can use __get_user
-	 * to safely read from kernel space.  Note that we now dump the
-	 * code first, just in case the backtrace kills us.
-	 */
-	fs = get_fs();
-	set_fs(KERNEL_DS);
-
-	for (i = -4; i < 1; i++) {
-		unsigned int val, bad;
-
-		bad = __get_user(val, &((u32 *)addr)[i]);
-
-		if (!bad)
-			p += sprintf(p, i == 0 ? "(%0*x) " : "%0*x ",
-					width, val);
-		else {
-			p += sprintf(p, "bad PC value");
-			break;
-		}
-	}
-	printk(KERN_DEFAULT "%sCode: %s\n", lvl, str);
-
-	set_fs(fs);
-}
-
-static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
-			   const char *loglvl)
-{
-	unsigned int fp;
-	int ok = 1;
-
-	printk("%sBacktrace: ", loglvl);
-
-	if (!tsk)
-		tsk = current;
-
-	if (regs)
-		fp = regs->UCreg_fp;
-	else if (tsk != current)
-		fp = thread_saved_fp(tsk);
-	else
-		asm("mov %0, fp" : "=r" (fp) : : "cc");
-
-	if (!fp) {
-		printk("%sno frame pointer", loglvl);
-		ok = 0;
-	} else if (verify_stack(fp)) {
-		printk("%sinvalid frame pointer 0x%08x", loglvl, fp);
-		ok = 0;
-	} else if (fp < (unsigned long)end_of_stack(tsk))
-		printk("%sframe pointer underflow", loglvl);
-	printk("%s\n", loglvl);
-
-	if (ok)
-		c_backtrace(fp, loglvl);
-}
-
-void show_stack(struct task_struct *tsk, unsigned long *sp,
-		       const char *loglvl)
-{
-	dump_backtrace(NULL, tsk, loglvl);
-	barrier();
-}
-
-static int __die(const char *str, int err, struct thread_info *thread,
-		struct pt_regs *regs)
-{
-	struct task_struct *tsk = thread->task;
-	static int die_counter;
-	int ret;
-
-	printk(KERN_EMERG "Internal error: %s: %x [#%d]\n",
-	       str, err, ++die_counter);
-
-	/* trap and error numbers are mostly meaningless on UniCore */
-	ret = notify_die(DIE_OOPS, str, regs, err, tsk->thread.trap_no, \
-			SIGSEGV);
-	if (ret == NOTIFY_STOP)
-		return ret;
-
-	print_modules();
-	__show_regs(regs);
-	printk(KERN_EMERG "Process %.*s (pid: %d, stack limit = 0x%p)\n",
-		TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1);
-
-	if (!user_mode(regs) || in_interrupt()) {
-		dump_mem(KERN_EMERG, "Stack: ", regs->UCreg_sp,
-			 THREAD_SIZE + (unsigned long)task_stack_page(tsk));
-		dump_backtrace(regs, tsk, KERN_EMERG);
-		dump_instr(KERN_EMERG, regs);
-	}
-
-	return ret;
-}
-
-DEFINE_SPINLOCK(die_lock);
-
-/*
- * This function is protected against re-entrancy.
- */
-void die(const char *str, struct pt_regs *regs, int err)
-{
-	struct thread_info *thread = current_thread_info();
-	int ret;
-
-	oops_enter();
-
-	spin_lock_irq(&die_lock);
-	console_verbose();
-	bust_spinlocks(1);
-	ret = __die(str, err, thread, regs);
-
-	bust_spinlocks(0);
-	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
-	spin_unlock_irq(&die_lock);
-	oops_exit();
-
-	if (in_interrupt())
-		panic("Fatal exception in interrupt");
-	if (panic_on_oops)
-		panic("Fatal exception");
-	if (ret != NOTIFY_STOP)
-		do_exit(SIGSEGV);
-}
-
-void uc32_notify_die(const char *str, struct pt_regs *regs,
-		int sig, int code, void __user *addr,
-		unsigned long err, unsigned long trap)
-{
-	if (user_mode(regs)) {
-		current->thread.error_code = err;
-		current->thread.trap_no = trap;
-
-		force_sig_fault(sig, code, addr);
-	} else
-		die(str, regs, err);
-}
-
-/*
- * bad_mode handles the impossible case in the vectors.  If you see one of
- * these, then it's extremely serious, and could mean you have buggy hardware.
- * It never returns, and never tries to sync.  We hope that we can at least
- * dump out some state information...
- */
-asmlinkage void bad_mode(struct pt_regs *regs, unsigned int reason)
-{
-	console_verbose();
-
-	printk(KERN_CRIT "Bad mode detected with reason 0x%x\n", reason);
-
-	die("Oops - bad mode", regs, 0);
-	local_irq_disable();
-	panic("bad mode");
-}
-
-void __pte_error(const char *file, int line, unsigned long val)
-{
-	printk(KERN_DEFAULT "%s:%d: bad pte %08lx.\n", file, line, val);
-}
-
-void __pmd_error(const char *file, int line, unsigned long val)
-{
-	printk(KERN_DEFAULT "%s:%d: bad pmd %08lx.\n", file, line, val);
-}
-
-void __pgd_error(const char *file, int line, unsigned long val)
-{
-	printk(KERN_DEFAULT "%s:%d: bad pgd %08lx.\n", file, line, val);
-}
-
-asmlinkage void __div0(void)
-{
-	printk(KERN_DEFAULT "Division by zero in kernel.\n");
-	dump_stack();
-}
-EXPORT_SYMBOL(__div0);
-
-void abort(void)
-{
-	BUG();
-
-	/* if that doesn't kill us, halt */
-	panic("Oops failed to kill thread");
-}
-
-void __init trap_init(void)
-{
-	return;
-}
-
-void __init early_trap_init(void)
-{
-	unsigned long vectors = VECTORS_BASE;
-
-	/*
-	 * Copy the vectors, stubs (in entry-unicore.S)
-	 * into the vector page, mapped at 0xffff0000, and ensure these
-	 * are visible to the instruction stream.
-	 */
-	memcpy((void *)vectors,
-			__vectors_start,
-			__vectors_end - __vectors_start);
-	memcpy((void *)vectors + 0x200,
-			__stubs_start,
-			__stubs_end - __stubs_start);
-
-	early_signal_init();
-
-	flush_icache_range(vectors, vectors + PAGE_SIZE);
-}
diff --git a/arch/unicore32/kernel/vmlinux.lds.S b/arch/unicore32/kernel/vmlinux.lds.S
deleted file mode 100644
index 6fb320b337ef..000000000000
--- a/arch/unicore32/kernel/vmlinux.lds.S
+++ /dev/null
@@ -1,59 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/kernel/vmlinux.lds.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/thread_info.h>
-#include <asm/memory.h>
-#include <asm/page.h>
-#include <asm/cache.h>
-
-OUTPUT_ARCH(unicore32)
-ENTRY(stext)
-
-jiffies = jiffies_64;
-
-SECTIONS
-{
-	. = PAGE_OFFSET + KERNEL_IMAGE_START;
-
-	_text = .;
-	__init_begin = .;
-	HEAD_TEXT_SECTION
-	INIT_TEXT_SECTION(PAGE_SIZE)
-	INIT_DATA_SECTION(16)
-	PERCPU_SECTION(L1_CACHE_BYTES)
-	__init_end = .;
-
-	_stext = .;
-	.text : {		/* Real text segment */
-		TEXT_TEXT
-		SCHED_TEXT
-		CPUIDLE_TEXT
-		LOCK_TEXT
-
-		*(.fixup)
-		*(.gnu.warning)
-	}
-	_etext = .;
-
-	_sdata = .;
-	RO_DATA(PAGE_SIZE)
-	RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
-	_edata = .;
-
-	EXCEPTION_TABLE(L1_CACHE_BYTES)
-
-	BSS_SECTION(0, 0, 0)
-	_end = .;
-
-	STABS_DEBUG
-	DWARF_DEBUG
-
-	DISCARDS		/* Exit code and data */
-}
diff --git a/arch/unicore32/lib/Makefile b/arch/unicore32/lib/Makefile
deleted file mode 100644
index 5af06645b8f0..000000000000
--- a/arch/unicore32/lib/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# linux/arch/unicore32/lib/Makefile
-#
-# Copyright (C) 2001-2010 GUAN Xue-tao
-#
-
-lib-y	:= backtrace.o delay.o findbit.o
-lib-y	+= strncpy_from_user.o strnlen_user.o
-lib-y	+= clear_user.o copy_page.o
-lib-y	+= copy_from_user.o copy_to_user.o
-
-GNU_LIBC_A		= $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libc.a)
-GNU_LIBC_A_OBJS		:= memchr.o memcpy.o memmove.o memset.o
-GNU_LIBC_A_OBJS		+= strchr.o strrchr.o
-GNU_LIBC_A_OBJS		+= rawmemchr.o			# needed by strrchr.o
-
-GNU_LIBGCC_A		= $(shell $(CC) $(KBUILD_CFLAGS) -print-file-name=libgcc.a)
-GNU_LIBGCC_A_OBJS	:= _ashldi3.o _ashrdi3.o _lshrdi3.o
-GNU_LIBGCC_A_OBJS	+= _divsi3.o _modsi3.o _ucmpdi2.o _umodsi3.o _udivsi3.o
-
-lib-y	+= $(GNU_LIBC_A_OBJS) $(GNU_LIBGCC_A_OBJS)
-
-$(addprefix $(obj)/, $(GNU_LIBC_A_OBJS)):
-	$(Q)$(AR) p $(GNU_LIBC_A) $(notdir $@) > $@
-
-$(addprefix $(obj)/, $(GNU_LIBGCC_A_OBJS)):
-	$(Q)$(AR) p $(GNU_LIBGCC_A) $(notdir $@) > $@
diff --git a/arch/unicore32/lib/backtrace.S b/arch/unicore32/lib/backtrace.S
deleted file mode 100644
index 6221944b81f3..000000000000
--- a/arch/unicore32/lib/backtrace.S
+++ /dev/null
@@ -1,168 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/backtrace.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-		.text
-
-@ fp is 0 or stack frame
-
-#define frame	v4
-#define sv_fp	v5
-#define sv_pc	v6
-#define offset	v8
-#define loglvl	v9
-
-ENTRY(__backtrace)
-		mov	r0, fp
-
-ENTRY(c_backtrace)
-
-#if !defined(CONFIG_FRAME_POINTER) || !defined(CONFIG_PRINTK)
-		mov	pc, lr
-ENDPROC(__backtrace)
-ENDPROC(c_backtrace)
-#else
-		stm.w	(v4 - v10, lr), [sp-]	@ Save an extra register
-						@ so we have a location...
-		mov.a	frame, r0		@ if frame pointer is zero
-		beq	no_frame		@ we have no stack frames
-		mov	loglvl, r1
-
-1:		stm.w	(pc), [sp-]		@ calculate offset of PC stored
-		ldw.w	r0, [sp]+, #4		@ by stmfd for this CPU
-		adr	r1, 1b
-		sub	offset, r0, r1
-
-/*
- * Stack frame layout:
- *             optionally saved caller registers (r4 - r10)
- *             saved fp
- *             saved sp
- *             saved lr
- *    frame => saved pc
- *             optionally saved arguments (r0 - r3)
- * saved sp => <next word>
- *
- * Functions start with the following code sequence:
- *                  mov   ip, sp
- *                  stm.w (r0 - r3), [sp-] (optional)
- * corrected pc =>  stm.w sp, (..., fp, ip, lr, pc)
- */
-for_each_frame:
-
-1001:		ldw	sv_pc, [frame+], #0	@ get saved pc
-1002:		ldw	sv_fp, [frame+], #-12	@ get saved fp
-
-		sub	sv_pc, sv_pc, offset	@ Correct PC for prefetching
-
-1003:		ldw	r2, [sv_pc+], #-4	@ if stmfd sp, {args} exists,
-		ldw	r3, .Ldsi+4		@ adjust saved 'pc' back one
-		cxor.a	r3, r2 >> #14		@ instruction
-		beq	201f
-		sub	r0, sv_pc, #4		@ allow for mov
-		b	202f
-201:
-		sub	r0, sv_pc, #8		@ allow for mov + stmia
-202:
-		ldw	r1, [frame+], #-4	@ get saved lr
-		mov	r2, frame
-		b.l	dump_backtrace_entry
-
-		ldw	r1, [sv_pc+], #-4	@ if stmfd sp, {args} exists,
-		ldw	r3, .Ldsi+4
-		cxor.a	r3, r1 >> #14
-		bne	1004f
-		ldw	r0, [frame+], #-8	@ get sp
-		sub	r0, r0, #4		@ point at the last arg
-		b.l	.Ldumpstm		@ dump saved registers
-
-1004:		ldw	r1, [sv_pc+], #0	@ if stmfd {, fp, ip, lr, pc}
-		ldw	r3, .Ldsi		@ instruction exists,
-		cxor.a	r3, r1 >> #14
-		bne	201f
-		sub	r0, frame, #16
-		b.l	.Ldumpstm		@ dump saved registers
-201:
-		cxor.a	sv_fp, #0		@ zero saved fp means
-		beq	no_frame		@ no further frames
-
-		csub.a	sv_fp, frame		@ next frame must be
-		mov	frame, sv_fp		@ above the current frame
-		bua	for_each_frame
-
-1006:		adr	r0, .Lbad
-		mov	r1, loglvl
-		mov	r2, frame
-		b.l	printk
-no_frame:	ldm.w	(v4 - v10, pc), [sp]+
-ENDPROC(__backtrace)
-ENDPROC(c_backtrace)
-
-		.pushsection __ex_table,"a"
-		.align	3
-		.long	1001b, 1006b
-		.long	1002b, 1006b
-		.long	1003b, 1006b
-		.long	1004b, 1006b
-		.popsection
-
-#define instr v4
-#define reg   v5
-#define stack v6
-
-.Ldumpstm:	stm.w	(instr, reg, stack, v7, lr), [sp-]
-		mov	stack, r0
-		mov	instr, r1
-		mov	reg, #14
-		mov	v7, #0
-1:		mov	r3, #1
-		csub.a	reg, #8
-		bne	201f
-		sub	reg, reg, #3
-201:
-		cand.a	instr, r3 << reg
-		beq	2f
-		add	v7, v7, #1
-		cxor.a	v7, #6
-		cmoveq	v7, #1
-		bne	201f
-		adr	r0, .Lcr
-		mov	r1, loglvl
-		b.l	printk
-201:
-		ldw.w	r3, [stack]+, #-4
-		mov	r2, reg
-		csub.a	r2, #8
-		bsl	201f
-		sub	r2, r2, #3
-201:
-		cand.a	instr, #0x40		@ if H is 1, high 16 regs
-		beq	201f
-		add	r2, r2, #0x10		@ so r2 need add 16
-201:
-		adr	r0, .Lfp
-		mov	r1, loglvl
-		b.l	printk
-2:		sub.a	reg, reg, #1
-		bns	1b
-		cxor.a	v7, #0
-		beq	201f
-		adr	r0, .Lcr
-		mov	r1, loglvl
-		b.l	printk
-201:		ldm.w	(instr, reg, stack, v7, pc), [sp]+
-
-.Lfp:		.asciz	"%sr%d:%08x "
-.Lcr:		.asciz	"%s\n"
-.Lbad:		.asciz	"%sBacktrace aborted due to bad frame pointer <%p>\n"
-		.align
-.Ldsi:		.word	0x92eec000 >> 14	@ stm.w sp, (... fp, ip, lr, pc)
-		.word	0x92e10000 >> 14	@ stm.w sp, ()
-
-#endif
diff --git a/arch/unicore32/lib/clear_user.S b/arch/unicore32/lib/clear_user.S
deleted file mode 100644
index c6ca431b1090..000000000000
--- a/arch/unicore32/lib/clear_user.S
+++ /dev/null
@@ -1,54 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/clear_user.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-		.text
-
-/* Prototype: int __clear_user(void *addr, size_t sz)
- * Purpose  : clear some user memory
- * Params   : addr - user memory address to clear
- *          : sz   - number of bytes to clear
- * Returns  : number of bytes NOT cleared
- */
-WEAK(__clear_user)
-		stm.w	(lr), [sp-]
-		stm.w	(r1), [sp-]
-		mov	r2, #0
-		csub.a	r1, #4
-		bsl	2f
-		and.a	ip, r0, #3
-		beq	1f
-		csub.a	ip, #2
-		strusr	r2, r0, 1
-		strusr	r2, r0, 1, el
-		strusr	r2, r0, 1, sl
-		rsub	ip, ip, #4
-		sub	r1, r1, ip		@  7  6  5  4  3  2  1
-1:		sub.a	r1, r1, #8		@ -1 -2 -3 -4 -5 -6 -7
-		strusr	r2, r0, 4, ns, rept=2
-		bns	1b
-		add.a	r1, r1, #4		@  3  2  1  0 -1 -2 -3
-		strusr	r2, r0, 4, ns
-2:		cand.a	r1, #2			@ 1x 1x 0x 0x 1x 1x 0x
-		strusr	r2, r0, 1, ne, rept=2
-		cand.a	r1, #1			@ x1 x0 x1 x0 x1 x0 x1
-		beq	3f
-USER(		stb.u	r2, [r0])
-3:		mov	r0, #0
-		ldm.w	(r1), [sp]+
-		ldm.w	(pc), [sp]+
-ENDPROC(__clear_user)
-
-		.pushsection .fixup,"ax"
-		.align	0
-9001:		ldm.w	(r0), [sp]+
-		ldm.w	(pc), [sp]+
-		.popsection
-
diff --git a/arch/unicore32/lib/copy_from_user.S b/arch/unicore32/lib/copy_from_user.S
deleted file mode 100644
index affb43920ac0..000000000000
--- a/arch/unicore32/lib/copy_from_user.S
+++ /dev/null
@@ -1,101 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/copy_from_user.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-/*
- * Prototype:
- *
- *	size_t raw_copy_from_user(void *to, const void *from, size_t n)
- *
- * Purpose:
- *
- *	copy a block to kernel memory from user memory
- *
- * Params:
- *
- *	to = kernel memory
- *	from = user memory
- *	n = number of bytes to copy
- *
- * Return value:
- *
- *	Number of bytes NOT copied.
- */
-
-	.macro ldr1w ptr reg abort
-	ldrusr	\reg, \ptr, 4, abort=\abort
-	.endm
-
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
-100:	ldm.w	(\reg1, \reg2, \reg3, \reg4), [\ptr]+
-	.pushsection __ex_table, "a"
-	.align	3
-	.long 100b, \abort
-	.popsection
-	.endm
-
-	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-100:	ldm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+
-	.pushsection __ex_table, "a"
-	.align	3
-	.long 100b, \abort
-	.popsection
-	.endm
-
-	.macro ldr1b ptr reg cond=al abort
-	ldrusr	\reg, \ptr, 1, \cond, abort=\abort
-	.endm
-
-	.macro str1w ptr reg abort
-	stw.w \reg, [\ptr]+, #4
-	.endm
-
-	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	stm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+
-	.endm
-
-	.macro str1b ptr reg cond=al abort
-	.ifnc	\cond, al
-	b\cond	201f
-	b	202f
-	.endif
-201:	stb.w \reg, [\ptr]+, #1
-202:
-	.endm
-
-	.macro enter
-	mov	r3, #0
-	stm.w	(r0, r2, r3), [sp-]
-	.endm
-
-	.macro exit
-	add	sp, sp, #8
-	ldm.w	(r0), [sp]+
-	mov	pc, lr
-	.endm
-
-	.text
-
-ENTRY(raw_copy_from_user)
-
-#include "copy_template.S"
-
-ENDPROC(raw_copy_from_user)
-
-	.pushsection .fixup,"ax"
-	.align 0
-	copy_abort_preamble
-	ldm.w	(r1, r2, r3), [sp]+
-	sub	r0, r0, r1
-	rsub	r0, r0, r2
-	copy_abort_end
-	.popsection
-
diff --git a/arch/unicore32/lib/copy_page.S b/arch/unicore32/lib/copy_page.S
deleted file mode 100644
index dc163f2d1af0..000000000000
--- a/arch/unicore32/lib/copy_page.S
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/copy_page.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  ASM optimised string functions
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <generated/asm-offsets.h>
-#include <asm/cache.h>
-
-#define COPY_COUNT (PAGE_SZ/256)
-
-		.text
-		.align	5
-/*
- * UniCore optimised copy_page routine
- */
-ENTRY(copy_page)
-		stm.w	(r17 - r19, lr), [sp-]
-		mov	r17, r0
-		mov	r18, r1
-		mov	r19, #COPY_COUNT
-1:
-	.rept	4
-		ldm.w	(r0 - r15), [r18]+
-		stm.w	(r0 - r15), [r17]+
-	.endr
-		sub.a	r19, r19, #1
-		bne	1b
-		ldm.w	(r17 - r19, pc), [sp]+
-ENDPROC(copy_page)
diff --git a/arch/unicore32/lib/copy_template.S b/arch/unicore32/lib/copy_template.S
deleted file mode 100644
index 02a7aef83fbf..000000000000
--- a/arch/unicore32/lib/copy_template.S
+++ /dev/null
@@ -1,211 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/copy_template.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-/*
- * Theory of operation
- * -------------------
- *
- * This file provides the core code for a forward memory copy used in
- * the implementation of memcopy(), copy_to_user() and copy_from_user().
- *
- * The including file must define the following accessor macros
- * according to the need of the given function:
- *
- * ldr1w ptr reg abort
- *
- *	This loads one word from 'ptr', stores it in 'reg' and increments
- *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
- *
- * ldr4w ptr reg1 reg2 reg3 reg4 abort
- * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
- *
- *	This loads four or eight words starting from 'ptr', stores them
- *	in provided registers and increments 'ptr' past those words.
- *	The'abort' argument is used for fixup tables.
- *
- * ldr1b ptr reg cond abort
- *
- *	Similar to ldr1w, but it loads a byte and increments 'ptr' one byte.
- *	It also must apply the condition code if provided, otherwise the
- *	"al" condition is assumed by default.
- *
- * str1w ptr reg abort
- * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
- * str1b ptr reg cond abort
- *
- *	Same as their ldr* counterparts, but data is stored to 'ptr' location
- *	rather than being loaded.
- *
- * enter
- *
- *	Preserve the provided registers on the stack plus any additional
- *	data as needed by the implementation including this code. Called
- *	upon code entry.
- *
- * exit
- *
- *	Restore registers with the values previously saved with the
- *	'preserv' macro. Called upon code termination.
- */
-
-
-		enter
-
-		sub.a	r2, r2, #4
-		bsl	8f
-		and.a	ip, r0, #3
-		bne	9f
-		and.a	ip, r1, #3
-		bne	10f
-
-1:		sub.a	r2, r2, #(28)
-		stm.w	(r5 - r8), [sp-]
-		bsl	5f
-
-3:
-4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, r10, r11, abort=20f
-		sub.a	r2, r2, #32
-		str8w	r0, r3, r4, r5, r6, r7, r8, r10, r11, abort=20f
-		beg	3b
-
-5:		and.a	ip, r2, #28
-		rsub	ip, ip, #32
-		beq	7f
-		add	pc, pc, ip		@ C is always clear here
-		nop
-
-		ldr1w	r1, r3, abort=20f
-		ldr1w	r1, r4, abort=20f
-		ldr1w	r1, r5, abort=20f
-		ldr1w	r1, r6, abort=20f
-		ldr1w	r1, r7, abort=20f
-		ldr1w	r1, r8, abort=20f
-		ldr1w	r1, r11, abort=20f
-
-		add	pc, pc, ip
-		nop
-
-		str1w	r0, r3, abort=20f
-		str1w	r0, r4, abort=20f
-		str1w	r0, r5, abort=20f
-		str1w	r0, r6, abort=20f
-		str1w	r0, r7, abort=20f
-		str1w	r0, r8, abort=20f
-		str1w	r0, r11, abort=20f
-
-7:		ldm.w	(r5 - r8), [sp]+
-
-8:		mov.a	r2, r2 << #31
-		ldr1b	r1, r3, ne, abort=21f
-		ldr1b	r1, r4, ea, abort=21f
-		ldr1b	r1, r10, ea, abort=21f
-		str1b	r0, r3, ne, abort=21f
-		str1b	r0, r4, ea, abort=21f
-		str1b	r0, r10, ea, abort=21f
-
-		exit
-
-9:		rsub	ip, ip, #4
-		csub.a	ip, #2
-		ldr1b	r1, r3, sg, abort=21f
-		ldr1b	r1, r4, eg, abort=21f
-		ldr1b	r1, r11, abort=21f
-		str1b	r0, r3, sg, abort=21f
-		str1b	r0, r4, eg, abort=21f
-		sub.a	r2, r2, ip
-		str1b	r0, r11, abort=21f
-		bsl	8b
-		and.a	ip, r1, #3
-		beq	1b
-
-10:		andn	r1, r1, #3
-		csub.a	ip, #2
-		ldr1w	r1, r11, abort=21f
-		beq	17f
-		bsg	18f
-
-
-		.macro	forward_copy_shift a b
-
-		sub.a	r2, r2, #28
-		bsl	14f
-
-11:		stm.w	(r5 - r9), [sp-]
-
-12:
-		ldr4w	r1, r4, r5, r6, r7, abort=19f
-		mov	r3, r11 pull #\a
-		sub.a	r2, r2, #32
-		ldr4w	r1, r8, r9, r10, r11, abort=19f
-		or	r3, r3, r4 push #\b
-		mov	r4, r4 pull #\a
-		or	r4, r4, r5 push #\b
-		mov	r5, r5 pull #\a
-		or	r5, r5, r6 push #\b
-		mov	r6, r6 pull #\a
-		or	r6, r6, r7 push #\b
-		mov	r7, r7 pull #\a
-		or	r7, r7, r8 push #\b
-		mov	r8, r8 pull #\a
-		or	r8, r8, r9 push #\b
-		mov	r9, r9 pull #\a
-		or	r9, r9, r10 push #\b
-		mov	r10, r10 pull #\a
-		or	r10, r10, r11 push #\b
-		str8w	r0, r3, r4, r5, r6, r7, r8, r9, r10, , abort=19f
-		beg	12b
-
-		ldm.w	(r5 - r9), [sp]+
-
-14:		and.a	ip, r2, #28
-		beq	16f
-
-15:		mov	r3, r11 pull #\a
-		ldr1w	r1, r11, abort=21f
-		sub.a	ip, ip, #4
-		or	r3, r3, r11 push #\b
-		str1w	r0, r3, abort=21f
-		bsg	15b
-
-16:		sub	r1, r1, #(\b / 8)
-		b	8b
-
-		.endm
-
-
-		forward_copy_shift	a=8	b=24
-
-17:		forward_copy_shift	a=16	b=16
-
-18:		forward_copy_shift	a=24	b=8
-
-
-/*
- * Abort preamble and completion macros.
- * If a fixup handler is required then those macros must surround it.
- * It is assumed that the fixup code will handle the private part of
- * the exit macro.
- */
-
-	.macro	copy_abort_preamble
-19:	ldm.w	(r5 - r9), [sp]+
-	b	21f
-299:	.word	0			@ store lr
-					@ to avoid function call in fixup
-20:	ldm.w	(r5 - r8), [sp]+
-21:
-	adr	r1, 299b
-	stw	lr, [r1]
-	.endm
-
-	.macro	copy_abort_end
-	adr	lr, 299b
-	ldw	pc, [lr]
-	.endm
-
diff --git a/arch/unicore32/lib/copy_to_user.S b/arch/unicore32/lib/copy_to_user.S
deleted file mode 100644
index c867f08f89ce..000000000000
--- a/arch/unicore32/lib/copy_to_user.S
+++ /dev/null
@@ -1,93 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/copy_to_user.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-
-/*
- * Prototype:
- *
- *	size_t raw_copy_to_user(void *to, const void *from, size_t n)
- *
- * Purpose:
- *
- *	copy a block to user memory from kernel memory
- *
- * Params:
- *
- *	to = user memory
- *	from = kernel memory
- *	n = number of bytes to copy
- *
- * Return value:
- *
- *	Number of bytes NOT copied.
- */
-
-	.macro ldr1w ptr reg abort
-	ldw.w \reg, [\ptr]+, #4
-	.endm
-
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
-	ldm.w	(\reg1, \reg2, \reg3, \reg4), [\ptr]+
-	.endm
-
-	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	ldm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+
-	.endm
-
-	.macro ldr1b ptr reg cond=al abort
-	notcond	\cond, .+8
-	ldb.w \reg, [\ptr]+, #1
-	.endm
-
-	.macro str1w ptr reg abort
-	strusr	\reg, \ptr, 4, abort=\abort
-	.endm
-
-	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-100:	stm.w (\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8), [\ptr]+
-
-	.pushsection __ex_table, "a"
-	.long 100b, \abort
-	.popsection
-	.endm
-
-	.macro str1b ptr reg cond=al abort
-	strusr	\reg, \ptr, 1, \cond, abort=\abort
-	.endm
-
-	.macro enter
-	mov	r3, #0
-	stm.w	(r0, r2, r3), [sp-]
-	.endm
-
-	.macro exit
-	add	sp, sp, #8
-	ldm.w	(r0), [sp]+
-	mov	pc, lr
-	.endm
-
-	.text
-
-WEAK(raw_copy_to_user)
-
-#include "copy_template.S"
-
-ENDPROC(raw_copy_to_user)
-
-	.pushsection .fixup,"ax"
-	.align 0
-	copy_abort_preamble
-	ldm.w	(r1, r2, r3), [sp]+
-	sub	r0, r0, r1
-	rsub	r0, r0, r2
-	copy_abort_end
-	.popsection
-
diff --git a/arch/unicore32/lib/delay.S b/arch/unicore32/lib/delay.S
deleted file mode 100644
index 6a359dd034e5..000000000000
--- a/arch/unicore32/lib/delay.S
+++ /dev/null
@@ -1,48 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/delay.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/param.h>
-		.text
-
-.LC0:		.word	loops_per_jiffy
-.LC1:		.word	(2199023*HZ)>>11
-
-/*
- * r0  <= 2000
- * lpj <= 0x01ffffff (max. 3355 bogomips)
- * HZ  <= 1000
- */
-
-ENTRY(__udelay)
-		ldw	r2, .LC1
-		mul	r0, r2, r0
-ENTRY(__const_udelay)				@ 0 <= r0 <= 0x7fffff06
-		ldw	r2, .LC0
-		ldw	r2, [r2]		@ max = 0x01ffffff
-		mov	r0, r0 >> #14		@ max = 0x0001ffff
-		mov	r2, r2 >> #10		@ max = 0x00007fff
-		mul	r0, r2, r0		@ max = 2^32-1
-		mov.a	r0, r0 >> #6
-		cmoveq	pc, lr
-
-/*
- * loops = r0 * HZ * loops_per_jiffy / 1000000
- *
- * Oh, if only we had a cycle counter...
- */
-
-@ Delay routine
-ENTRY(__delay)
-		sub.a	r0, r0, #2
-		bua	__delay
-		mov	pc, lr
-ENDPROC(__udelay)
-ENDPROC(__const_udelay)
-ENDPROC(__delay)
diff --git a/arch/unicore32/lib/findbit.S b/arch/unicore32/lib/findbit.S
deleted file mode 100644
index 42f1282670d2..000000000000
--- a/arch/unicore32/lib/findbit.S
+++ /dev/null
@@ -1,97 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/findbit.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-                .text
-
-/*
- * Purpose  : Find a 'zero' bit
- * Prototype: int find_first_zero_bit(void *addr, unsigned int maxbit);
- */
-ENTRY(find_first_zero_bit)
-		cxor.a	r1, #0
-		beq	3f
-		mov	r2, #0
-1:		ldb	r3, [r0+], r2 >> #3
-		xor.a	r3, r3, #0xff		@ invert bits
-		bne	.L_found		@ any now set - found zero bit
-		add	r2, r2, #8		@ next bit pointer
-2:		csub.a	r2, r1			@ any more?
-		bub	1b
-3:		mov	r0, r1			@ no free bits
-		mov	pc, lr
-ENDPROC(find_first_zero_bit)
-
-/*
- * Purpose  : Find next 'zero' bit
- * Prototype: int find_next_zero_bit
- *		(void *addr, unsigned int maxbit, int offset)
- */
-ENTRY(find_next_zero_bit)
-		cxor.a	r1, #0
-		beq	3b
-		and.a	ip, r2, #7
-		beq	1b			@ If new byte, goto old routine
-		ldb	r3, [r0+], r2 >> #3
-		xor	r3, r3, #0xff		@ now looking for a 1 bit
-		mov.a	r3, r3 >> ip		@ shift off unused bits
-		bne	.L_found
-		or	r2, r2, #7		@ if zero, then no bits here
-		add	r2, r2, #1		@ align bit pointer
-		b	2b			@ loop for next bit
-ENDPROC(find_next_zero_bit)
-
-/*
- * Purpose  : Find a 'one' bit
- * Prototype: int find_first_bit
- *		(const unsigned long *addr, unsigned int maxbit);
- */
-ENTRY(find_first_bit)
-		cxor.a	r1, #0
-		beq	3f
-		mov	r2, #0
-1:		ldb	r3, [r0+], r2 >> #3
-		mov.a	r3, r3
-		bne	.L_found		@ any now set - found zero bit
-		add	r2, r2, #8		@ next bit pointer
-2:		csub.a	r2, r1			@ any more?
-		bub	1b
-3:		mov	r0, r1			@ no free bits
-		mov	pc, lr
-ENDPROC(find_first_bit)
-
-/*
- * Purpose  : Find next 'one' bit
- * Prototype: int find_next_zero_bit
- *		(void *addr, unsigned int maxbit, int offset)
- */
-ENTRY(find_next_bit)
-		cxor.a	r1, #0
-		beq	3b
-		and.a	ip, r2, #7
-		beq	1b			@ If new byte, goto old routine
-		ldb	r3, [r0+], r2 >> #3
-		mov.a	r3, r3 >> ip		@ shift off unused bits
-		bne	.L_found
-		or	r2, r2, #7		@ if zero, then no bits here
-		add	r2, r2, #1		@ align bit pointer
-		b	2b			@ loop for next bit
-ENDPROC(find_next_bit)
-
-/*
- * One or more bits in the LSB of r3 are assumed to be set.
- */
-.L_found:
-		rsub	r1, r3, #0
-		and	r3, r3, r1
-		cntlz	r3, r3
-		rsub	r3, r3, #31
-		add	r0, r2, r3
-		mov	pc, lr
-
diff --git a/arch/unicore32/lib/strncpy_from_user.S b/arch/unicore32/lib/strncpy_from_user.S
deleted file mode 100644
index f227b8227a4c..000000000000
--- a/arch/unicore32/lib/strncpy_from_user.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/strncpy_from_user.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/errno.h>
-
-	.text
-	.align	5
-
-/*
- * Copy a string from user space to kernel space.
- *  r0 = dst, r1 = src, r2 = byte length
- * returns the number of characters copied (strlen of copied string),
- *  -EFAULT on exception, or "len" if we fill the whole buffer
- */
-ENTRY(__strncpy_from_user)
-	mov	ip, r1
-1:	sub.a	r2, r2, #1
-	ldrusr	r3, r1, 1, ns
-	bfs	2f
-	stb.w	r3, [r0]+, #1
-	cxor.a	r3, #0
-	bne	1b
-	sub	r1, r1, #1	@ take NUL character out of count
-2:	sub	r0, r1, ip
-	mov	pc, lr
-ENDPROC(__strncpy_from_user)
-
-	.pushsection .fixup,"ax"
-	.align	0
-9001:	mov	r3, #0
-	stb	r3, [r0+], #0	@ null terminate
-	mov	r0, #-EFAULT
-	mov	pc, lr
-	.popsection
-
diff --git a/arch/unicore32/lib/strnlen_user.S b/arch/unicore32/lib/strnlen_user.S
deleted file mode 100644
index c836b12776fe..000000000000
--- a/arch/unicore32/lib/strnlen_user.S
+++ /dev/null
@@ -1,39 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/lib/strnlen_user.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/errno.h>
-
-	.text
-	.align	5
-
-/* Prototype: unsigned long __strnlen_user(const char *str, long n)
- * Purpose  : get length of a string in user memory
- * Params   : str - address of string in user memory
- * Returns  : length of string *including terminator*
- *	      or zero on exception, or n + 1 if too long
- */
-ENTRY(__strnlen_user)
-	mov	r2, r0
-1:
-	ldrusr	r3, r0, 1
-	cxor.a	r3, #0
-	beq	2f
-	sub.a	r1, r1, #1
-	bne	1b
-	add	r0, r0, #1
-2:	sub	r0, r0, r2
-	mov	pc, lr
-ENDPROC(__strnlen_user)
-
-	.pushsection .fixup,"ax"
-	.align	0
-9001:	mov	r0, #0
-	mov	pc, lr
-	.popsection
diff --git a/arch/unicore32/mm/Kconfig b/arch/unicore32/mm/Kconfig
deleted file mode 100644
index 82759b6aba67..000000000000
--- a/arch/unicore32/mm/Kconfig
+++ /dev/null
@@ -1,41 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-comment "Processor Type"
-
-# Select CPU types depending on the architecture selected.  This selects
-# which CPUs we support in the kernel image, and the compiler instruction
-# optimiser behaviour.
-
-config CPU_UCV2
-	def_bool y
-
-comment "Processor Features"
-
-config CPU_ICACHE_DISABLE
-	bool "Disable I-Cache (I-bit)"
-	help
-	  Say Y here to disable the processor instruction cache. Unless
-	  you have a reason not to or are unsure, say N.
-
-config CPU_DCACHE_DISABLE
-	bool "Disable D-Cache (D-bit)"
-	help
-	  Say Y here to disable the processor data cache. Unless
-	  you have a reason not to or are unsure, say N.
-
-config CPU_DCACHE_WRITETHROUGH
-	bool "Force write through D-cache"
-	help
-	  Say Y here to use the data cache in writethrough mode. Unless you
-	  specifically require this or are unsure, say N.
-
-config CPU_DCACHE_LINE_DISABLE
-	bool "Disable D-cache line ops"
-	default y
-	help
-	  Say Y here to disable the data cache line operations.
-
-config CPU_TLB_SINGLE_ENTRY_DISABLE
-	bool "Disable TLB single entry ops"
-	default y
-	help
-	  Say Y here to disable the TLB single entry operations.
diff --git a/arch/unicore32/mm/Makefile b/arch/unicore32/mm/Makefile
deleted file mode 100644
index 8106260583ab..000000000000
--- a/arch/unicore32/mm/Makefile
+++ /dev/null
@@ -1,14 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for the linux unicore-specific parts of the memory manager.
-#
-
-obj-y				:= extable.o fault.o init.o pgd.o mmu.o
-obj-y				+= flush.o ioremap.o
-
-obj-$(CONFIG_MODULES)		+= proc-syms.o
-
-obj-$(CONFIG_ALIGNMENT_TRAP)	+= alignment.o
-
-obj-$(CONFIG_CPU_UCV2)		+= cache-ucv2.o tlb-ucv2.o proc-ucv2.o
-
diff --git a/arch/unicore32/mm/alignment.c b/arch/unicore32/mm/alignment.c
deleted file mode 100644
index 2ea98f7a4156..000000000000
--- a/arch/unicore32/mm/alignment.c
+++ /dev/null
@@ -1,524 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/alignment.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-/*
- * TODO:
- *  FPU ldm/stm not handling
- */
-#include <linux/compiler.h>
-#include <linux/kernel.h>
-#include <linux/sched/debug.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/pgtable.h>
-
-#include <asm/tlbflush.h>
-#include <asm/unaligned.h>
-
-#include "mm.h"
-
-#define CODING_BITS(i)	(i & 0xe0000120)
-
-#define LDST_P_BIT(i)	(i & (1 << 28))	/* Preindex             */
-#define LDST_U_BIT(i)	(i & (1 << 27))	/* Add offset           */
-#define LDST_W_BIT(i)	(i & (1 << 25))	/* Writeback            */
-#define LDST_L_BIT(i)	(i & (1 << 24))	/* Load                 */
-
-#define LDST_P_EQ_U(i)	((((i) ^ ((i) >> 1)) & (1 << 27)) == 0)
-
-#define LDSTH_I_BIT(i)	(i & (1 << 26))	/* half-word immed      */
-#define LDM_S_BIT(i)	(i & (1 << 26))	/* write ASR from BSR */
-#define LDM_H_BIT(i)	(i & (1 << 6))	/* select r0-r15 or r16-r31 */
-
-#define RN_BITS(i)	((i >> 19) & 31)	/* Rn                   */
-#define RD_BITS(i)	((i >> 14) & 31)	/* Rd                   */
-#define RM_BITS(i)	(i & 31)	/* Rm                   */
-
-#define REGMASK_BITS(i)	(((i & 0x7fe00) >> 3) | (i & 0x3f))
-#define OFFSET_BITS(i)	(i & 0x03fff)
-
-#define SHIFT_BITS(i)	((i >> 9) & 0x1f)
-#define SHIFT_TYPE(i)	(i & 0xc0)
-#define SHIFT_LSL	0x00
-#define SHIFT_LSR	0x40
-#define SHIFT_ASR	0x80
-#define SHIFT_RORRRX	0xc0
-
-union offset_union {
-	unsigned long un;
-	signed long sn;
-};
-
-#define TYPE_ERROR	0
-#define TYPE_FAULT	1
-#define TYPE_LDST	2
-#define TYPE_DONE	3
-#define TYPE_SWAP  4
-#define TYPE_COLS  5		/* Coprocessor load/store */
-
-#define get8_unaligned_check(val, addr, err)		\
-	__asm__(					\
-	"1:	ldb.u	%1, [%2], #1\n"			\
-	"2:\n"						\
-	"	.pushsection .fixup,\"ax\"\n"		\
-	"	.align	2\n"				\
-	"3:	mov	%0, #1\n"			\
-	"	b	2b\n"				\
-	"	.popsection\n"				\
-	"	.pushsection __ex_table,\"a\"\n"		\
-	"	.align	3\n"				\
-	"	.long	1b, 3b\n"			\
-	"	.popsection\n"				\
-	: "=r" (err), "=&r" (val), "=r" (addr)		\
-	: "0" (err), "2" (addr))
-
-#define get8t_unaligned_check(val, addr, err)		\
-	__asm__(					\
-	"1:	ldb.u	%1, [%2], #1\n"			\
-	"2:\n"						\
-	"	.pushsection .fixup,\"ax\"\n"		\
-	"	.align	2\n"				\
-	"3:	mov	%0, #1\n"			\
-	"	b	2b\n"				\
-	"	.popsection\n"				\
-	"	.pushsection __ex_table,\"a\"\n"		\
-	"	.align	3\n"				\
-	"	.long	1b, 3b\n"			\
-	"	.popsection\n"				\
-	: "=r" (err), "=&r" (val), "=r" (addr)		\
-	: "0" (err), "2" (addr))
-
-#define get16_unaligned_check(val, addr)			\
-	do {							\
-		unsigned int err = 0, v, a = addr;		\
-		get8_unaligned_check(val, a, err);		\
-		get8_unaligned_check(v, a, err);		\
-		val |= v << 8;					\
-		if (err)					\
-			goto fault;				\
-	} while (0)
-
-#define put16_unaligned_check(val, addr)			\
-	do {							\
-		unsigned int err = 0, v = val, a = addr;	\
-		__asm__(					\
-		"1:	stb.u	%1, [%2], #1\n"			\
-		"	mov	%1, %1 >> #8\n"			\
-		"2:	stb.u	%1, [%2]\n"			\
-		"3:\n"						\
-		"	.pushsection .fixup,\"ax\"\n"		\
-		"	.align	2\n"				\
-		"4:	mov	%0, #1\n"			\
-		"	b	3b\n"				\
-		"	.popsection\n"				\
-		"	.pushsection __ex_table,\"a\"\n"		\
-		"	.align	3\n"				\
-		"	.long	1b, 4b\n"			\
-		"	.long	2b, 4b\n"			\
-		"	.popsection\n"				\
-		: "=r" (err), "=&r" (v), "=&r" (a)		\
-		: "0" (err), "1" (v), "2" (a));			\
-		if (err)					\
-			goto fault;				\
-	} while (0)
-
-#define __put32_unaligned_check(ins, val, addr)			\
-	do {							\
-		unsigned int err = 0, v = val, a = addr;	\
-		__asm__(					\
-		"1:	"ins"	%1, [%2], #1\n"			\
-		"	mov	%1, %1 >> #8\n"			\
-		"2:	"ins"	%1, [%2], #1\n"			\
-		"	mov	%1, %1 >> #8\n"			\
-		"3:	"ins"	%1, [%2], #1\n"			\
-		"	mov	%1, %1 >> #8\n"			\
-		"4:	"ins"	%1, [%2]\n"			\
-		"5:\n"						\
-		"	.pushsection .fixup,\"ax\"\n"		\
-		"	.align	2\n"				\
-		"6:	mov	%0, #1\n"			\
-		"	b	5b\n"				\
-		"	.popsection\n"				\
-		"	.pushsection __ex_table,\"a\"\n"		\
-		"	.align	3\n"				\
-		"	.long	1b, 6b\n"			\
-		"	.long	2b, 6b\n"			\
-		"	.long	3b, 6b\n"			\
-		"	.long	4b, 6b\n"			\
-		"	.popsection\n"				\
-		: "=r" (err), "=&r" (v), "=&r" (a)		\
-		: "0" (err), "1" (v), "2" (a));			\
-		if (err)					\
-			goto fault;				\
-	} while (0)
-
-#define get32_unaligned_check(val, addr)			\
-	do {							\
-		unsigned int err = 0, v, a = addr;		\
-		get8_unaligned_check(val, a, err);		\
-		get8_unaligned_check(v, a, err);		\
-		val |= v << 8;					\
-		get8_unaligned_check(v, a, err);		\
-		val |= v << 16;					\
-		get8_unaligned_check(v, a, err);		\
-		val |= v << 24;					\
-		if (err)					\
-			goto fault;				\
-	} while (0)
-
-#define put32_unaligned_check(val, addr)			\
-	__put32_unaligned_check("stb.u", val, addr)
-
-#define get32t_unaligned_check(val, addr)			\
-	do {							\
-		unsigned int err = 0, v, a = addr;		\
-		get8t_unaligned_check(val, a, err);		\
-		get8t_unaligned_check(v, a, err);		\
-		val |= v << 8;					\
-		get8t_unaligned_check(v, a, err);		\
-		val |= v << 16;					\
-		get8t_unaligned_check(v, a, err);		\
-		val |= v << 24;					\
-		if (err)					\
-			goto fault;				\
-	} while (0)
-
-#define put32t_unaligned_check(val, addr)			\
-	__put32_unaligned_check("stb.u", val, addr)
-
-static void
-do_alignment_finish_ldst(unsigned long addr, unsigned long instr,
-			 struct pt_regs *regs, union offset_union offset)
-{
-	if (!LDST_U_BIT(instr))
-		offset.un = -offset.un;
-
-	if (!LDST_P_BIT(instr))
-		addr += offset.un;
-
-	if (!LDST_P_BIT(instr) || LDST_W_BIT(instr))
-		regs->uregs[RN_BITS(instr)] = addr;
-}
-
-static int
-do_alignment_ldrhstrh(unsigned long addr, unsigned long instr,
-		      struct pt_regs *regs)
-{
-	unsigned int rd = RD_BITS(instr);
-
-	/* old value 0x40002120, can't judge swap instr correctly */
-	if ((instr & 0x4b003fe0) == 0x40000120)
-		goto swp;
-
-	if (LDST_L_BIT(instr)) {
-		unsigned long val;
-		get16_unaligned_check(val, addr);
-
-		/* signed half-word? */
-		if (instr & 0x80)
-			val = (signed long)((signed short)val);
-
-		regs->uregs[rd] = val;
-	} else
-		put16_unaligned_check(regs->uregs[rd], addr);
-
-	return TYPE_LDST;
-
-swp:
-	/* only handle swap word
-	 * for swap byte should not active this alignment exception */
-	get32_unaligned_check(regs->uregs[RD_BITS(instr)], addr);
-	put32_unaligned_check(regs->uregs[RM_BITS(instr)], addr);
-	return TYPE_SWAP;
-
-fault:
-	return TYPE_FAULT;
-}
-
-static int
-do_alignment_ldrstr(unsigned long addr, unsigned long instr,
-		    struct pt_regs *regs)
-{
-	unsigned int rd = RD_BITS(instr);
-
-	if (!LDST_P_BIT(instr) && LDST_W_BIT(instr))
-		goto trans;
-
-	if (LDST_L_BIT(instr))
-		get32_unaligned_check(regs->uregs[rd], addr);
-	else
-		put32_unaligned_check(regs->uregs[rd], addr);
-	return TYPE_LDST;
-
-trans:
-	if (LDST_L_BIT(instr))
-		get32t_unaligned_check(regs->uregs[rd], addr);
-	else
-		put32t_unaligned_check(regs->uregs[rd], addr);
-	return TYPE_LDST;
-
-fault:
-	return TYPE_FAULT;
-}
-
-/*
- * LDM/STM alignment handler.
- *
- * There are 4 variants of this instruction:
- *
- * B = rn pointer before instruction, A = rn pointer after instruction
- *              ------ increasing address ----->
- *	        |    | r0 | r1 | ... | rx |    |
- * PU = 01             B                    A
- * PU = 11        B                    A
- * PU = 00        A                    B
- * PU = 10             A                    B
- */
-static int
-do_alignment_ldmstm(unsigned long addr, unsigned long instr,
-		    struct pt_regs *regs)
-{
-	unsigned int rd, rn, pc_correction, reg_correction, nr_regs, regbits;
-	unsigned long eaddr, newaddr;
-
-	if (LDM_S_BIT(instr))
-		goto bad;
-
-	pc_correction = 4;	/* processor implementation defined */
-
-	/* count the number of registers in the mask to be transferred */
-	nr_regs = hweight16(REGMASK_BITS(instr)) * 4;
-
-	rn = RN_BITS(instr);
-	newaddr = eaddr = regs->uregs[rn];
-
-	if (!LDST_U_BIT(instr))
-		nr_regs = -nr_regs;
-	newaddr += nr_regs;
-	if (!LDST_U_BIT(instr))
-		eaddr = newaddr;
-
-	if (LDST_P_EQ_U(instr))	/* U = P */
-		eaddr += 4;
-
-	/*
-	 * This is a "hint" - we already have eaddr worked out by the
-	 * processor for us.
-	 */
-	if (addr != eaddr) {
-		printk(KERN_ERR "LDMSTM: PC = %08lx, instr = %08lx, "
-		       "addr = %08lx, eaddr = %08lx\n",
-		       instruction_pointer(regs), instr, addr, eaddr);
-		show_regs(regs);
-	}
-
-	if (LDM_H_BIT(instr))
-		reg_correction = 0x10;
-	else
-		reg_correction = 0x00;
-
-	for (regbits = REGMASK_BITS(instr), rd = 0; regbits;
-	     regbits >>= 1, rd += 1)
-		if (regbits & 1) {
-			if (LDST_L_BIT(instr))
-				get32_unaligned_check(regs->
-					uregs[rd + reg_correction], eaddr);
-			else
-				put32_unaligned_check(regs->
-					uregs[rd + reg_correction], eaddr);
-			eaddr += 4;
-		}
-
-	if (LDST_W_BIT(instr))
-		regs->uregs[rn] = newaddr;
-	return TYPE_DONE;
-
-fault:
-	regs->UCreg_pc -= pc_correction;
-	return TYPE_FAULT;
-
-bad:
-	printk(KERN_ERR "Alignment trap: not handling ldm with s-bit set\n");
-	return TYPE_ERROR;
-}
-
-static int
-do_alignment(unsigned long addr, unsigned int error_code, struct pt_regs *regs)
-{
-	union offset_union offset;
-	unsigned long instr, instrptr;
-	int (*handler) (unsigned long addr, unsigned long instr,
-			struct pt_regs *regs);
-	unsigned int type;
-
-	instrptr = instruction_pointer(regs);
-	if (instrptr >= PAGE_OFFSET)
-		instr = *(unsigned long *)instrptr;
-	else {
-		__asm__ __volatile__(
-				"ldw.u	%0, [%1]\n"
-				: "=&r"(instr)
-				: "r"(instrptr));
-	}
-
-	regs->UCreg_pc += 4;
-
-	switch (CODING_BITS(instr)) {
-	case 0x40000120:	/* ldrh or strh */
-		if (LDSTH_I_BIT(instr))
-			offset.un = (instr & 0x3e00) >> 4 | (instr & 31);
-		else
-			offset.un = regs->uregs[RM_BITS(instr)];
-		handler = do_alignment_ldrhstrh;
-		break;
-
-	case 0x60000000:	/* ldr or str immediate */
-	case 0x60000100:	/* ldr or str immediate */
-	case 0x60000020:	/* ldr or str immediate */
-	case 0x60000120:	/* ldr or str immediate */
-		offset.un = OFFSET_BITS(instr);
-		handler = do_alignment_ldrstr;
-		break;
-
-	case 0x40000000:	/* ldr or str register */
-		offset.un = regs->uregs[RM_BITS(instr)];
-		{
-			unsigned int shiftval = SHIFT_BITS(instr);
-
-			switch (SHIFT_TYPE(instr)) {
-			case SHIFT_LSL:
-				offset.un <<= shiftval;
-				break;
-
-			case SHIFT_LSR:
-				offset.un >>= shiftval;
-				break;
-
-			case SHIFT_ASR:
-				offset.sn >>= shiftval;
-				break;
-
-			case SHIFT_RORRRX:
-				if (shiftval == 0) {
-					offset.un >>= 1;
-					if (regs->UCreg_asr & PSR_C_BIT)
-						offset.un |= 1 << 31;
-				} else
-					offset.un = offset.un >> shiftval |
-					    offset.un << (32 - shiftval);
-				break;
-			}
-		}
-		handler = do_alignment_ldrstr;
-		break;
-
-	case 0x80000000:	/* ldm or stm */
-	case 0x80000020:	/* ldm or stm */
-		handler = do_alignment_ldmstm;
-		break;
-
-	default:
-		goto bad;
-	}
-
-	type = handler(addr, instr, regs);
-
-	if (type == TYPE_ERROR || type == TYPE_FAULT)
-		goto bad_or_fault;
-
-	if (type == TYPE_LDST)
-		do_alignment_finish_ldst(addr, instr, regs, offset);
-
-	return 0;
-
-bad_or_fault:
-	if (type == TYPE_ERROR)
-		goto bad;
-	regs->UCreg_pc -= 4;
-	/*
-	 * We got a fault - fix it up, or die.
-	 */
-	do_bad_area(addr, error_code, regs);
-	return 0;
-
-bad:
-	/*
-	 * Oops, we didn't handle the instruction.
-	 * However, we must handle fpu instr firstly.
-	 */
-#ifdef CONFIG_UNICORE_FPU_F64
-	/* handle co.load/store */
-#define CODING_COLS                0xc0000000
-#define COLS_OFFSET_BITS(i)	(i & 0x1FF)
-#define COLS_L_BITS(i)		(i & (1<<24))
-#define COLS_FN_BITS(i)		((i>>14) & 31)
-	if ((instr & 0xe0000000) == CODING_COLS) {
-		unsigned int fn = COLS_FN_BITS(instr);
-		unsigned long val = 0;
-		if (COLS_L_BITS(instr)) {
-			get32t_unaligned_check(val, addr);
-			switch (fn) {
-#define ASM_MTF(n)	case n:						\
-			__asm__ __volatile__("MTF %0, F" __stringify(n)	\
-				: : "r"(val));				\
-			break;
-			ASM_MTF(0); ASM_MTF(1); ASM_MTF(2); ASM_MTF(3);
-			ASM_MTF(4); ASM_MTF(5); ASM_MTF(6); ASM_MTF(7);
-			ASM_MTF(8); ASM_MTF(9); ASM_MTF(10); ASM_MTF(11);
-			ASM_MTF(12); ASM_MTF(13); ASM_MTF(14); ASM_MTF(15);
-			ASM_MTF(16); ASM_MTF(17); ASM_MTF(18); ASM_MTF(19);
-			ASM_MTF(20); ASM_MTF(21); ASM_MTF(22); ASM_MTF(23);
-			ASM_MTF(24); ASM_MTF(25); ASM_MTF(26); ASM_MTF(27);
-			ASM_MTF(28); ASM_MTF(29); ASM_MTF(30); ASM_MTF(31);
-#undef ASM_MTF
-			}
-		} else {
-			switch (fn) {
-#define ASM_MFF(n)	case n:						\
-			__asm__ __volatile__("MFF %0, F" __stringify(n)	\
-				: : "r"(val));				\
-			break;
-			ASM_MFF(0); ASM_MFF(1); ASM_MFF(2); ASM_MFF(3);
-			ASM_MFF(4); ASM_MFF(5); ASM_MFF(6); ASM_MFF(7);
-			ASM_MFF(8); ASM_MFF(9); ASM_MFF(10); ASM_MFF(11);
-			ASM_MFF(12); ASM_MFF(13); ASM_MFF(14); ASM_MFF(15);
-			ASM_MFF(16); ASM_MFF(17); ASM_MFF(18); ASM_MFF(19);
-			ASM_MFF(20); ASM_MFF(21); ASM_MFF(22); ASM_MFF(23);
-			ASM_MFF(24); ASM_MFF(25); ASM_MFF(26); ASM_MFF(27);
-			ASM_MFF(28); ASM_MFF(29); ASM_MFF(30); ASM_MFF(31);
-#undef ASM_MFF
-			}
-			put32t_unaligned_check(val, addr);
-		}
-		return TYPE_COLS;
-	}
-fault:
-	return TYPE_FAULT;
-#endif
-	printk(KERN_ERR "Alignment trap: not handling instruction "
-	       "%08lx at [<%08lx>]\n", instr, instrptr);
-	return 1;
-}
-
-/*
- * This needs to be done after sysctl_init, otherwise sys/ will be
- * overwritten.  Actually, this shouldn't be in sys/ at all since
- * it isn't a sysctl, and it doesn't contain sysctl information.
- */
-static int __init alignment_init(void)
-{
-	hook_fault_code(1, do_alignment, SIGBUS, BUS_ADRALN,
-			"alignment exception");
-
-	return 0;
-}
-
-fs_initcall(alignment_init);
diff --git a/arch/unicore32/mm/cache-ucv2.S b/arch/unicore32/mm/cache-ucv2.S
deleted file mode 100644
index 2108837d6f4f..000000000000
--- a/arch/unicore32/mm/cache-ucv2.S
+++ /dev/null
@@ -1,209 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/mm/cache-ucv2.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- *  This is the "shell" of the UniCore-v2 processor support.
- */
-#include <linux/linkage.h>
-#include <linux/init.h>
-#include <asm/assembler.h>
-#include <asm/page.h>
-
-#include "proc-macros.S"
-
-/*
- *	__cpuc_flush_icache_all()
- *	__cpuc_flush_kern_all()
- *	__cpuc_flush_user_all()
- *
- *	Flush the entire cache.
- */
-ENTRY(__cpuc_flush_icache_all)
-	/*FALLTHROUGH*/
-ENTRY(__cpuc_flush_kern_all)
-	/*FALLTHROUGH*/
-ENTRY(__cpuc_flush_user_all)
-	mov	r0, #0
-	movc	p0.c5, r0, #14			@ Dcache flush all
-	nop8
-
-	mov	r0, #0
-	movc	p0.c5, r0, #20			@ Icache invalidate all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	__cpuc_flush_user_range(start, end, flags)
- *
- *	Flush a range of TLB entries in the specified address space.
- *
- *	- start - start address (may not be aligned)
- *	- end   - end address (exclusive, may not be aligned)
- *	- flags	- vm_area_struct flags describing address space
- */
-ENTRY(__cpuc_flush_user_range)
-	cxor.a	r2, #0
-	beq	__cpuc_dma_flush_range
-
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	andn	r0, r0, #CACHE_LINESIZE - 1	@ Safety check
-	sub	r1, r1, r0
-	csub.a	r1, #MAX_AREA_SIZE
-	bsg	2f
-
-	andn	r1, r1, #CACHE_LINESIZE - 1
-	add	r1, r1, #CACHE_LINESIZE
-
-101:	dcacheline_flush	r0, r11, r12
-
-	add	r0, r0, #CACHE_LINESIZE
-	sub.a	r1, r1, #CACHE_LINESIZE
-	bns	101b
-	b	3f
-#endif
-2:	mov	ip, #0
-	movc	p0.c5, ip, #14			@ Dcache flush all
-	nop8
-
-3:	mov	ip, #0
-	movc	p0.c5, ip, #20			@ Icache invalidate all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	__cpuc_coherent_kern_range(start,end)
- *	__cpuc_coherent_user_range(start,end)
- *
- *	Ensure that the I and D caches are coherent within specified
- *	region.  This is typically used when code has been written to
- *	a memory region, and will be executed.
- *
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
-ENTRY(__cpuc_coherent_kern_range)
-	/* FALLTHROUGH */
-ENTRY(__cpuc_coherent_user_range)
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	andn	r0, r0, #CACHE_LINESIZE - 1	@ Safety check
-	sub	r1, r1, r0
-	csub.a	r1, #MAX_AREA_SIZE
-	bsg	2f
-
-	andn	r1, r1, #CACHE_LINESIZE - 1
-	add	r1, r1, #CACHE_LINESIZE
-
-	@ r0 va2pa r10
-	mov	r9, #PAGE_SZ
-	sub	r9, r9, #1			@ PAGE_MASK
-101:	va2pa	r0, r10, r11, r12, r13, 2f	@ r10 is PA
-	b	103f
-102:	cand.a	r0, r9
-	beq	101b
-
-103:	movc	p0.c5, r10, #11			@ Dcache clean line of R10
-	nop8
-
-	add	r0, r0, #CACHE_LINESIZE
-	add	r10, r10, #CACHE_LINESIZE
-	sub.a	r1, r1, #CACHE_LINESIZE
-	bns	102b
-	b	3f
-#endif
-2:	mov	ip, #0
-	movc	p0.c5, ip, #10			@ Dcache clean all
-	nop8
-
-3:	mov	ip, #0
-	movc	p0.c5, ip, #20			@ Icache invalidate all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	__cpuc_flush_kern_dcache_area(void *addr, size_t size)
- *
- *	- addr	- kernel address
- *	- size	- region size
- */
-ENTRY(__cpuc_flush_kern_dcache_area)
-	mov	ip, #0
-	movc	p0.c5, ip, #14			@ Dcache flush all
-	nop8
-	mov	pc, lr
-
-/*
- *	__cpuc_dma_clean_range(start,end)
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
-ENTRY(__cpuc_dma_clean_range)
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	andn	r0, r0, #CACHE_LINESIZE - 1
-	sub	r1, r1, r0
-	andn	r1, r1, #CACHE_LINESIZE - 1
-	add	r1, r1, #CACHE_LINESIZE
-
-	csub.a	r1, #MAX_AREA_SIZE
-	bsg	2f
-
-	@ r0 va2pa r10
-	mov	r9, #PAGE_SZ
-	sub	r9, r9, #1			@ PAGE_MASK
-101:	va2pa	r0, r10, r11, r12, r13, 2f	@ r10 is PA
-	b	1f
-102:	cand.a	r0, r9
-	beq	101b
-
-1:	movc	p0.c5, r10, #11			@ Dcache clean line of R10
-	nop8
-	add	r0, r0, #CACHE_LINESIZE
-	add	r10, r10, #CACHE_LINESIZE
-	sub.a	r1, r1, #CACHE_LINESIZE
-	bns	102b
-	mov	pc, lr
-#endif
-2:	mov	ip, #0
-	movc	p0.c5, ip, #10			@ Dcache clean all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	__cpuc_dma_inv_range(start,end)
- *	__cpuc_dma_flush_range(start,end)
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
-__cpuc_dma_inv_range:
-	/* FALLTHROUGH */
-ENTRY(__cpuc_dma_flush_range)
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	andn	r0, r0, #CACHE_LINESIZE - 1
-	sub	r1, r1, r0
-	andn	r1, r1, #CACHE_LINESIZE - 1
-	add	r1, r1, #CACHE_LINESIZE
-
-	csub.a	r1, #MAX_AREA_SIZE
-	bsg	2f
-
-	@ r0 va2pa r10
-101:	dcacheline_flush	r0, r11, r12
-
-	add	r0, r0, #CACHE_LINESIZE
-	sub.a	r1, r1, #CACHE_LINESIZE
-	bns	101b
-	mov	pc, lr
-#endif
-2:	mov	ip, #0
-	movc	p0.c5, ip, #14			@ Dcache flush all
-	nop8
-
-	mov	pc, lr
-
diff --git a/arch/unicore32/mm/extable.c b/arch/unicore32/mm/extable.c
deleted file mode 100644
index e53352b41c4a..000000000000
--- a/arch/unicore32/mm/extable.c
+++ /dev/null
@@ -1,21 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/extable.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/extable.h>
-#include <linux/uaccess.h>
-
-int fixup_exception(struct pt_regs *regs)
-{
-	const struct exception_table_entry *fixup;
-
-	fixup = search_exception_tables(instruction_pointer(regs));
-	if (fixup)
-		regs->UCreg_pc = fixup->fixup;
-
-	return fixup != NULL;
-}
diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
deleted file mode 100644
index 7654bddde133..000000000000
--- a/arch/unicore32/mm/fault.c
+++ /dev/null
@@ -1,481 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/fault.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/extable.h>
-#include <linux/signal.h>
-#include <linux/mm.h>
-#include <linux/hardirq.h>
-#include <linux/init.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/page-flags.h>
-#include <linux/sched/signal.h>
-#include <linux/io.h>
-
-#include <asm/tlbflush.h>
-
-/*
- * Fault status register encodings.  We steal bit 31 for our own purposes.
- */
-#define FSR_LNX_PF		(1 << 31)
-
-static inline int fsr_fs(unsigned int fsr)
-{
-	/* xyabcde will be abcde+xy */
-	return (fsr & 31) + ((fsr & (3 << 5)) >> 5);
-}
-
-/*
- * This is useful to dump out the page tables associated with
- * 'addr' in mm 'mm'.
- */
-void show_pte(struct mm_struct *mm, unsigned long addr)
-{
-	pgd_t *pgd;
-
-	if (!mm)
-		mm = &init_mm;
-
-	printk(KERN_ALERT "pgd = %p\n", mm->pgd);
-	pgd = pgd_offset(mm, addr);
-	printk(KERN_ALERT "[%08lx] *pgd=%08lx", addr, pgd_val(*pgd));
-
-	do {
-		pmd_t *pmd;
-		pte_t *pte;
-
-		if (pgd_none(*pgd))
-			break;
-
-		if (pgd_bad(*pgd)) {
-			printk("(bad)");
-			break;
-		}
-
-		pmd = pmd_offset((pud_t *) pgd, addr);
-		if (PTRS_PER_PMD != 1)
-			printk(", *pmd=%08lx", pmd_val(*pmd));
-
-		if (pmd_none(*pmd))
-			break;
-
-		if (pmd_bad(*pmd)) {
-			printk("(bad)");
-			break;
-		}
-
-		/* We must not map this if we have highmem enabled */
-		if (PageHighMem(pfn_to_page(pmd_val(*pmd) >> PAGE_SHIFT)))
-			break;
-
-		pte = pte_offset_map(pmd, addr);
-		printk(", *pte=%08lx", pte_val(*pte));
-		pte_unmap(pte);
-	} while (0);
-
-	printk("\n");
-}
-
-/*
- * Oops.  The kernel tried to access some page that wasn't present.
- */
-static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr,
-		unsigned int fsr, struct pt_regs *regs)
-{
-	/*
-	 * Are we prepared to handle this kernel fault?
-	 */
-	if (fixup_exception(regs))
-		return;
-
-	/*
-	 * No handler, we'll have to terminate things with extreme prejudice.
-	 */
-	bust_spinlocks(1);
-	printk(KERN_ALERT
-	       "Unable to handle kernel %s at virtual address %08lx\n",
-	       (addr < PAGE_SIZE) ? "NULL pointer dereference" :
-	       "paging request", addr);
-
-	show_pte(mm, addr);
-	die("Oops", regs, fsr);
-	bust_spinlocks(0);
-	do_exit(SIGKILL);
-}
-
-/*
- * Something tried to access memory that isn't in our memory map..
- * User mode accesses just cause a SIGSEGV
- */
-static void __do_user_fault(unsigned long addr, unsigned int fsr,
-			    unsigned int sig, int code,	struct pt_regs *regs)
-{
-	struct task_struct *tsk = current;
-
-	tsk->thread.address = addr;
-	tsk->thread.error_code = fsr;
-	tsk->thread.trap_no = 14;
-	force_sig_fault(sig, code, (void __user *)addr);
-}
-
-void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
-{
-	struct task_struct *tsk = current;
-	struct mm_struct *mm = tsk->active_mm;
-
-	/*
-	 * If we are in kernel mode at this point, we
-	 * have no context to handle this fault with.
-	 */
-	if (user_mode(regs))
-		__do_user_fault(addr, fsr, SIGSEGV, SEGV_MAPERR, regs);
-	else
-		__do_kernel_fault(mm, addr, fsr, regs);
-}
-
-#define VM_FAULT_BADMAP		0x010000
-#define VM_FAULT_BADACCESS	0x020000
-
-/*
- * Check that the permissions on the VMA allow for the fault which occurred.
- * If we encountered a write fault, we must have write permission, otherwise
- * we allow any permission.
- */
-static inline bool access_error(unsigned int fsr, struct vm_area_struct *vma)
-{
-	unsigned int mask = VM_ACCESS_FLAGS;
-
-	if (!(fsr ^ 0x12))	/* write? */
-		mask = VM_WRITE;
-	if (fsr & FSR_LNX_PF)
-		mask = VM_EXEC;
-
-	return vma->vm_flags & mask ? false : true;
-}
-
-static vm_fault_t __do_pf(struct mm_struct *mm, unsigned long addr,
-		unsigned int fsr, unsigned int flags, struct task_struct *tsk)
-{
-	struct vm_area_struct *vma;
-	vm_fault_t fault;
-
-	vma = find_vma(mm, addr);
-	fault = VM_FAULT_BADMAP;
-	if (unlikely(!vma))
-		goto out;
-	if (unlikely(vma->vm_start > addr))
-		goto check_stack;
-
-	/*
-	 * Ok, we have a good vm_area for this
-	 * memory access, so we can handle it.
-	 */
-good_area:
-	if (access_error(fsr, vma)) {
-		fault = VM_FAULT_BADACCESS;
-		goto out;
-	}
-
-	/*
-	 * If for any reason at all we couldn't handle the fault, make
-	 * sure we exit gracefully rather than endlessly redo the fault.
-	 */
-	fault = handle_mm_fault(vma, addr & PAGE_MASK, flags);
-	return fault;
-
-check_stack:
-	if (vma->vm_flags & VM_GROWSDOWN && !expand_stack(vma, addr))
-		goto good_area;
-out:
-	return fault;
-}
-
-static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
-{
-	struct task_struct *tsk;
-	struct mm_struct *mm;
-	int sig, code;
-	vm_fault_t fault;
-	unsigned int flags = FAULT_FLAG_DEFAULT;
-
-	tsk = current;
-	mm = tsk->mm;
-
-	/*
-	 * If we're in an interrupt or have no user
-	 * context, we must not take the fault..
-	 */
-	if (faulthandler_disabled() || !mm)
-		goto no_context;
-
-	if (user_mode(regs))
-		flags |= FAULT_FLAG_USER;
-	if (!(fsr ^ 0x12))
-		flags |= FAULT_FLAG_WRITE;
-
-	/*
-	 * As per x86, we may deadlock here.  However, since the kernel only
-	 * validly references user space from well defined areas of the code,
-	 * we can bug out early if this is from code which shouldn't.
-	 */
-	if (!mmap_read_trylock(mm)) {
-		if (!user_mode(regs)
-		    && !search_exception_tables(regs->UCreg_pc))
-			goto no_context;
-retry:
-		mmap_read_lock(mm);
-	} else {
-		/*
-		 * The above down_read_trylock() might have succeeded in
-		 * which case, we'll have missed the might_sleep() from
-		 * down_read()
-		 */
-		might_sleep();
-#ifdef CONFIG_DEBUG_VM
-		if (!user_mode(regs) &&
-		    !search_exception_tables(regs->UCreg_pc))
-			goto no_context;
-#endif
-	}
-
-	fault = __do_pf(mm, addr, fsr, flags, tsk);
-
-	/* If we need to retry but a fatal signal is pending, handle the
-	 * signal first. We do not need to release the mmap_lock because
-	 * it would already be released in __lock_page_or_retry in
-	 * mm/filemap.c. */
-	if (fault_signal_pending(fault, regs))
-		return 0;
-
-	if (!(fault & VM_FAULT_ERROR) && (flags & FAULT_FLAG_ALLOW_RETRY)) {
-		if (fault & VM_FAULT_MAJOR)
-			tsk->maj_flt++;
-		else
-			tsk->min_flt++;
-		if (fault & VM_FAULT_RETRY) {
-			flags |= FAULT_FLAG_TRIED;
-			goto retry;
-		}
-	}
-
-	mmap_read_unlock(mm);
-
-	/*
-	 * Handle the "normal" case first - VM_FAULT_MAJOR
-	 */
-	if (likely(!(fault &
-	       (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
-		return 0;
-
-	/*
-	 * If we are in kernel mode at this point, we
-	 * have no context to handle this fault with.
-	 */
-	if (!user_mode(regs))
-		goto no_context;
-
-	if (fault & VM_FAULT_OOM) {
-		/*
-		 * We ran out of memory, call the OOM killer, and return to
-		 * userspace (which will retry the fault, or kill us if we
-		 * got oom-killed)
-		 */
-		pagefault_out_of_memory();
-		return 0;
-	}
-
-	if (fault & VM_FAULT_SIGBUS) {
-		/*
-		 * We had some memory, but were unable to
-		 * successfully fix up this page fault.
-		 */
-		sig = SIGBUS;
-		code = BUS_ADRERR;
-	} else {
-		/*
-		 * Something tried to access memory that
-		 * isn't in our memory map..
-		 */
-		sig = SIGSEGV;
-		code = fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR;
-	}
-
-	__do_user_fault(addr, fsr, sig, code, regs);
-	return 0;
-
-no_context:
-	__do_kernel_fault(mm, addr, fsr, regs);
-	return 0;
-}
-
-/*
- * First Level Translation Fault Handler
- *
- * We enter here because the first level page table doesn't contain
- * a valid entry for the address.
- *
- * If the address is in kernel space (>= TASK_SIZE), then we are
- * probably faulting in the vmalloc() area.
- *
- * If the init_task's first level page tables contains the relevant
- * entry, we copy the it to this task.  If not, we send the process
- * a signal, fixup the exception, or oops the kernel.
- *
- * NOTE! We MUST NOT take any locks for this case. We may be in an
- * interrupt or a critical region, and should only copy the information
- * from the master page table, nothing more.
- */
-static int do_ifault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
-{
-	unsigned int index;
-	pgd_t *pgd, *pgd_k;
-	pmd_t *pmd, *pmd_k;
-
-	if (addr < TASK_SIZE)
-		return do_pf(addr, fsr, regs);
-
-	if (user_mode(regs))
-		goto bad_area;
-
-	index = pgd_index(addr);
-
-	pgd = cpu_get_pgd() + index;
-	pgd_k = init_mm.pgd + index;
-
-	if (pgd_none(*pgd_k))
-		goto bad_area;
-
-	pmd_k = pmd_offset((pud_t *) pgd_k, addr);
-	pmd = pmd_offset((pud_t *) pgd, addr);
-
-	if (pmd_none(*pmd_k))
-		goto bad_area;
-
-	set_pmd(pmd, *pmd_k);
-	flush_pmd_entry(pmd);
-	return 0;
-
-bad_area:
-	do_bad_area(addr, fsr, regs);
-	return 0;
-}
-
-/*
- * This abort handler always returns "fault".
- */
-static int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
-{
-	return 1;
-}
-
-static int do_good(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
-{
-	unsigned int res1, res2;
-
-	printk("dabt exception but no error!\n");
-
-	__asm__ __volatile__(
-			"mff %0,f0\n"
-			"mff %1,f1\n"
-			: "=r"(res1), "=r"(res2)
-			:
-			: "memory");
-
-	printk(KERN_EMERG "r0 :%08x  r1 :%08x\n", res1, res2);
-	panic("shut up\n");
-	return 0;
-}
-
-static struct fsr_info {
-	int (*fn) (unsigned long addr, unsigned int fsr, struct pt_regs *regs);
-	int sig;
-	int code;
-	const char *name;
-} fsr_info[] = {
-	/*
-	 * The following are the standard Unicore-I and UniCore-II aborts.
-	 */
-	{ do_good,	SIGBUS,  0,		"no error"		},
-	{ do_bad,	SIGBUS,  BUS_ADRALN,	"alignment exception"	},
-	{ do_bad,	SIGBUS,  BUS_OBJERR,	"external exception"	},
-	{ do_bad,	SIGBUS,  0,		"burst operation"	},
-	{ do_bad,	SIGBUS,  0,		"unknown 00100"		},
-	{ do_ifault,	SIGSEGV, SEGV_MAPERR,	"2nd level pt non-exist"},
-	{ do_bad,	SIGBUS,  0,		"2nd lvl large pt non-exist" },
-	{ do_bad,	SIGBUS,  0,		"invalid pte"		},
-	{ do_pf,	SIGSEGV, SEGV_MAPERR,	"page miss"		},
-	{ do_bad,	SIGBUS,  0,		"middle page miss"	},
-	{ do_bad,	SIGBUS,	 0,		"large page miss"	},
-	{ do_pf,	SIGSEGV, SEGV_MAPERR,	"super page (section) miss" },
-	{ do_bad,	SIGBUS,  0,		"unknown 01100"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 01101"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 01110"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 01111"		},
-	{ do_bad,	SIGBUS,  0,		"addr: up 3G or IO"	},
-	{ do_pf,	SIGSEGV, SEGV_ACCERR,	"read unreadable addr"	},
-	{ do_pf,	SIGSEGV, SEGV_ACCERR,	"write unwriteable addr"},
-	{ do_pf,	SIGSEGV, SEGV_ACCERR,	"exec unexecutable addr"},
-	{ do_bad,	SIGBUS,  0,		"unknown 10100"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 10101"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 10110"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 10111"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11000"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11001"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11010"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11011"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11100"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11101"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11110"		},
-	{ do_bad,	SIGBUS,  0,		"unknown 11111"		}
-};
-
-void __init hook_fault_code(int nr,
-		int (*fn) (unsigned long, unsigned int, struct pt_regs *),
-		int sig, int code, const char *name)
-{
-	if (nr < 0 || nr >= ARRAY_SIZE(fsr_info))
-		BUG();
-
-	fsr_info[nr].fn   = fn;
-	fsr_info[nr].sig  = sig;
-	fsr_info[nr].code = code;
-	fsr_info[nr].name = name;
-}
-
-/*
- * Dispatch a data abort to the relevant handler.
- */
-asmlinkage void do_DataAbort(unsigned long addr, unsigned int fsr,
-			struct pt_regs *regs)
-{
-	const struct fsr_info *inf = fsr_info + fsr_fs(fsr);
-
-	if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs))
-		return;
-
-	printk(KERN_ALERT "Unhandled fault: %s (0x%03x) at 0x%08lx\n",
-	       inf->name, fsr, addr);
-
-	uc32_notify_die("", regs, inf->sig, inf->code, (void __user *)addr,
-			fsr, 0);
-}
-
-asmlinkage void do_PrefetchAbort(unsigned long addr,
-			unsigned int ifsr, struct pt_regs *regs)
-{
-	const struct fsr_info *inf = fsr_info + fsr_fs(ifsr);
-
-	if (!inf->fn(addr, ifsr | FSR_LNX_PF, regs))
-		return;
-
-	printk(KERN_ALERT "Unhandled prefetch abort: %s (0x%03x) at 0x%08lx\n",
-	       inf->name, ifsr, addr);
-
-	uc32_notify_die("", regs, inf->sig, inf->code, (void __user *)addr,
-			ifsr, 0);
-}
diff --git a/arch/unicore32/mm/flush.c b/arch/unicore32/mm/flush.c
deleted file mode 100644
index 65954f8d89a2..000000000000
--- a/arch/unicore32/mm/flush.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/flush.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-void flush_cache_mm(struct mm_struct *mm)
-{
-}
-
-void flush_cache_range(struct vm_area_struct *vma, unsigned long start,
-		unsigned long end)
-{
-	if (vma->vm_flags & VM_EXEC)
-		__flush_icache_all();
-}
-
-void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr,
-		unsigned long pfn)
-{
-}
-
-static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page,
-			 unsigned long uaddr, void *kaddr, unsigned long len)
-{
-	/* VIPT non-aliasing D-cache */
-	if (vma->vm_flags & VM_EXEC) {
-		unsigned long addr = (unsigned long)kaddr;
-
-		__cpuc_coherent_kern_range(addr, addr + len);
-	}
-}
-
-/*
- * Copy user data from/to a page which is mapped into a different
- * processes address space.  Really, we want to allow our "user
- * space" model to handle this.
- *
- * Note that this code needs to run on the current CPU.
- */
-void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
-		       unsigned long uaddr, void *dst, const void *src,
-		       unsigned long len)
-{
-	memcpy(dst, src, len);
-	flush_ptrace_access(vma, page, uaddr, dst, len);
-}
-
-void __flush_dcache_page(struct address_space *mapping, struct page *page)
-{
-	/*
-	 * Writeback any data associated with the kernel mapping of this
-	 * page.  This ensures that data in the physical page is mutually
-	 * coherent with the kernels mapping.
-	 */
-	__cpuc_flush_kern_dcache_area(page_address(page), PAGE_SIZE);
-}
-
-/*
- * Ensure cache coherency between kernel mapping and userspace mapping
- * of this page.
- */
-void flush_dcache_page(struct page *page)
-{
-	struct address_space *mapping;
-
-	/*
-	 * The zero page is never written to, so never has any dirty
-	 * cache lines, and therefore never needs to be flushed.
-	 */
-	if (page == ZERO_PAGE(0))
-		return;
-
-	mapping = page_mapping_file(page);
-
-	if (mapping && !mapping_mapped(mapping))
-		clear_bit(PG_dcache_clean, &page->flags);
-	else {
-		__flush_dcache_page(mapping, page);
-		if (mapping)
-			__flush_icache_all();
-		set_bit(PG_dcache_clean, &page->flags);
-	}
-}
-EXPORT_SYMBOL(flush_dcache_page);
diff --git a/arch/unicore32/mm/init.c b/arch/unicore32/mm/init.c
deleted file mode 100644
index 52425d383cea..000000000000
--- a/arch/unicore32/mm/init.c
+++ /dev/null
@@ -1,261 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- *  linux/arch/unicore32/mm/init.c
- *
- *  Copyright (C) 2010 GUAN Xue-tao
- */
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/swap.h>
-#include <linux/init.h>
-#include <linux/memblock.h>
-#include <linux/mman.h>
-#include <linux/nodemask.h>
-#include <linux/initrd.h>
-#include <linux/highmem.h>
-#include <linux/gfp.h>
-#include <linux/sort.h>
-#include <linux/dma-mapping.h>
-#include <linux/export.h>
-
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <linux/sizes.h>
-#include <asm/tlb.h>
-#include <asm/memblock.h>
-#include <mach/map.h>
-
-#include "mm.h"
-
-/*
- * This keeps memory configuration data used by a couple memory
- * initialization functions, as well as show_mem() for the skipping
- * of holes in the memory map.  It is populated by uc32_add_memory().
- */
-struct meminfo meminfo;
-
-static void __init find_limits(unsigned long *min, unsigned long *max_low,
-	unsigned long *max_high)
-{
-	struct meminfo *mi = &meminfo;
-	int i;
-
-	*min = -1UL;
-	*max_low = *max_high = 0;
-
-	for_each_bank(i, mi) {
-		struct membank *bank = &mi->bank[i];
-		unsigned long start, end;
-
-		start = bank_pfn_start(bank);
-		end = bank_pfn_end(bank);
-
-		if (*min > start)
-			*min = start;
-		if (*max_high < end)
-			*max_high = end;
-		if (bank->highmem)
-			continue;
-		if (*max_low < end)
-			*max_low = end;
-	}
-}
-
-static void __init uc32_bootmem_free(unsigned long max_low)
-{
-	unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 };
-
-	max_zone_pfn[ZONE_DMA] = max_low;
-	max_zone_pfn[ZONE_NORMAL] = max_low;
-
-	/*
-	 * Adjust the sizes according to any special requirements for
-	 * this machine type.
-	 * This might lower ZONE_DMA limit.
-	 */
-	arch_adjust_zones(max_zone_pfn);
-
-	free_area_init(max_zone_pfn);
-}
-
-int pfn_valid(unsigned long pfn)
-{
-	return memblock_is_memory(pfn << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(pfn_valid);
-
-static void uc32_memory_present(void)
-{
-}
-
-static int __init meminfo_cmp(const void *_a, const void *_b)
-{
-	const struct membank *a = _a, *b = _b;
-	long cmp = bank_pfn_start(a) - bank_pfn_start(b);
-	return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
-}
-
-void __init uc32_memblock_init(struct meminfo *mi)
-{
-	int i;
-
-	sort(&meminfo.bank, meminfo.nr_banks, sizeof(meminfo.bank[0]),
-		meminfo_cmp, NULL);
-
-	for (i = 0; i < mi->nr_banks; i++)
-		memblock_add(mi->bank[i].start, mi->bank[i].size);
-
-	/* Register the kernel text, kernel data and initrd with memblock. */
-	memblock_reserve(__pa(_text), _end - _text);
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	if (!phys_initrd_size) {
-		phys_initrd_start = 0x01000000;
-		phys_initrd_size = SZ_8M;
-	}
-
-	if (phys_initrd_size) {
-		memblock_reserve(phys_initrd_start, phys_initrd_size);
-
-		/* Now convert initrd to virtual addresses */
-		initrd_start = __phys_to_virt(phys_initrd_start);
-		initrd_end = initrd_start + phys_initrd_size;
-	}
-#endif
-
-	uc32_mm_memblock_reserve();
-
-	memblock_allow_resize();
-	memblock_dump_all();
-}
-
-void __init bootmem_init(void)
-{
-	unsigned long min, max_low, max_high;
-
-	max_low = max_high = 0;
-
-	find_limits(&min, &max_low, &max_high);
-
-	node_set_online(0);
-
-	/*
-	 * Sparsemem tries to allocate bootmem in memory_present(),
-	 * so must be done after the fixed reservations
-	 */
-	uc32_memory_present();
-
-	/*
-	 * sparse_init() needs the bootmem allocator up and running.
-	 */
-	sparse_init();
-
-	/*
-	 * Now free the memory - free_area_init needs
-	 * the sparse mem_map arrays initialized by sparse_init()
-	 * for memmap_init_zone(), otherwise all PFNs are invalid.
-	 */
-	uc32_bootmem_free(max_low);
-
-	high_memory = __va((max_low << PAGE_SHIFT) - 1) + 1;
-
-	/*
-	 * This doesn't seem to be used by the Linux memory manager any
-	 * more, but is used by ll_rw_block.  If we can get rid of it, we
-	 * also get rid of some of the stuff above as well.
-	 *
-	 * Note: max_low_pfn and max_pfn reflect the number of _pages_ in
-	 * the system, not the maximum PFN.
-	 */
-	max_low_pfn = max_low - PHYS_PFN_OFFSET;
-	max_pfn = max_high - PHYS_PFN_OFFSET;
-}
-
-static inline void
-free_memmap(unsigned long start_pfn, unsigned long end_pfn)
-{
-	struct page *start_pg, *end_pg;
-	unsigned long pg, pgend;
-
-	/*
-	 * Convert start_pfn/end_pfn to a struct page pointer.
-	 */
-	start_pg = pfn_to_page(start_pfn - 1) + 1;
-	end_pg = pfn_to_page(end_pfn);
-
-	/*
-	 * Convert to physical addresses, and
-	 * round start upwards and end downwards.
-	 */
-	pg = PAGE_ALIGN(__pa(start_pg));
-	pgend = __pa(end_pg) & PAGE_MASK;
-
-	/*
-	 * If there are free pages between these,
-	 * free the section of the memmap array.
-	 */
-	if (pg < pgend)
-		memblock_free(pg, pgend - pg);
-}
-
-/*
- * The mem_map array can get very big.  Free the unused area of the memory map.
- */
-static void __init free_unused_memmap(struct meminfo *mi)
-{
-	unsigned long bank_start, prev_bank_end = 0;
-	unsigned int i;
-
-	/*
-	 * This relies on each bank being in address order.
-	 * The banks are sorted previously in bootmem_init().
-	 */
-	for_each_bank(i, mi) {
-		struct membank *bank = &mi->bank[i];
-
-		bank_start = bank_pfn_start(bank);
-
-		/*
-		 * If we had a previous bank, and there is a space
-		 * between the current bank and the previous, free it.
-		 */
-		if (prev_bank_end && prev_bank_end < bank_start)
-			free_memmap(prev_bank_end, bank_start);
-
-		/*
-		 * Align up here since the VM subsystem insists that the
-		 * memmap entries are valid from the bank end aligned to
-		 * MAX_ORDER_NR_PAGES.
-		 */
-		prev_bank_end = ALIGN(bank_pfn_end(bank), MAX_ORDER_NR_PAGES);
-	}
-}
-
-/*
- * mem_init() marks the free areas in the mem_map and tells us how much
- * memory is free.  This is done after various parts of the system have
- * claimed their memory after the kernel image.
- */
-void __init mem_init(void)
-{
-	max_mapnr   = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;
-
-	free_unused_memmap(&meminfo);
-
-	/* this will put all unused low memory onto the freelists */
-	memblock_free_all();
-
-	mem_init_print_info(NULL);
-
-	BUILD_BUG_ON(TASK_SIZE				> MODULES_VADDR);
-	BUG_ON(TASK_SIZE				> MODULES_VADDR);
-
-	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
-		/*
-		 * On a machine this small we won't get
-		 * anywhere without overcommit, so turn
-		 * it on by default.
-		 */
-		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
-	}
-}
diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c
deleted file mode 100644
index 46a64bd6156a..000000000000
--- a/arch/unicore32/mm/ioremap.c
+++ /dev/null
@@ -1,242 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/ioremap.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * Re-map IO memory to kernel address space so that we can access it.
- *
- * This allows a driver to remap an arbitrary region of bus memory into
- * virtual space.  One should *only* use readl, writel, memcpy_toio and
- * so on with such remapped areas.
- *
- * Because UniCore only has a 32-bit address space we can't address the
- * whole of the (physical) PCI space at once.  PCI huge-mode addressing
- * allows us to circumvent this restriction by splitting PCI space into
- * two 2GB chunks and mapping only one at a time into processor memory.
- * We use MMU protection domains to trap any attempt to access the bank
- * that is not currently mapped.  (This isn't fully implemented yet.)
- */
-#include <linux/module.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-#include <linux/io.h>
-
-#include <asm/cputype.h>
-#include <asm/cacheflush.h>
-#include <asm/mmu_context.h>
-#include <asm/pgalloc.h>
-#include <asm/tlbflush.h>
-#include <linux/sizes.h>
-
-#include <mach/map.h>
-#include "mm.h"
-
-/*
- * Used by ioremap() and iounmap() code to mark (super)section-mapped
- * I/O regions in vm_struct->flags field.
- */
-#define VM_UNICORE_SECTION_MAPPING	0x80000000
-
-int ioremap_page(unsigned long virt, unsigned long phys,
-		 const struct mem_type *mtype)
-{
-	return ioremap_page_range(virt, virt + PAGE_SIZE, phys,
-				  __pgprot(mtype->prot_pte));
-}
-EXPORT_SYMBOL(ioremap_page);
-
-/*
- * Section support is unsafe on SMP - If you iounmap and ioremap a region,
- * the other CPUs will not see this change until their next context switch.
- * Meanwhile, (eg) if an interrupt comes in on one of those other CPUs
- * which requires the new ioremap'd region to be referenced, the CPU will
- * reference the _old_ region.
- *
- * Note that get_vm_area_caller() allocates a guard 4K page, so we need to
- * mask the size back to 4MB aligned or we will overflow in the loop below.
- */
-static void unmap_area_sections(unsigned long virt, unsigned long size)
-{
-	unsigned long addr = virt, end = virt + (size & ~(SZ_4M - 1));
-	pgd_t *pgd;
-
-	flush_cache_vunmap(addr, end);
-	pgd = pgd_offset_k(addr);
-	do {
-		pmd_t pmd, *pmdp = pmd_offset((pud_t *)pgd, addr);
-
-		pmd = *pmdp;
-		if (!pmd_none(pmd)) {
-			/*
-			 * Clear the PMD from the page table, and
-			 * increment the kvm sequence so others
-			 * notice this change.
-			 *
-			 * Note: this is still racy on SMP machines.
-			 */
-			pmd_clear(pmdp);
-
-			/*
-			 * Free the page table, if there was one.
-			 */
-			if ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_TABLE)
-				pte_free_kernel(&init_mm, pmd_page_vaddr(pmd));
-		}
-
-		addr += PGDIR_SIZE;
-		pgd++;
-	} while (addr < end);
-
-	flush_tlb_kernel_range(virt, end);
-}
-
-static int
-remap_area_sections(unsigned long virt, unsigned long pfn,
-		    size_t size, const struct mem_type *type)
-{
-	unsigned long addr = virt, end = virt + size;
-	pgd_t *pgd;
-
-	/*
-	 * Remove and free any PTE-based mapping, and
-	 * sync the current kernel mapping.
-	 */
-	unmap_area_sections(virt, size);
-
-	pgd = pgd_offset_k(addr);
-	do {
-		pmd_t *pmd = pmd_offset((pud_t *)pgd, addr);
-
-		set_pmd(pmd, __pmd(__pfn_to_phys(pfn) | type->prot_sect));
-		pfn += SZ_4M >> PAGE_SHIFT;
-		flush_pmd_entry(pmd);
-
-		addr += PGDIR_SIZE;
-		pgd++;
-	} while (addr < end);
-
-	return 0;
-}
-
-void __iomem *__uc32_ioremap_pfn_caller(unsigned long pfn,
-	unsigned long offset, size_t size, unsigned int mtype, void *caller)
-{
-	const struct mem_type *type;
-	int err;
-	unsigned long addr;
-	struct vm_struct *area;
-
-	/*
-	 * High mappings must be section aligned
-	 */
-	if (pfn >= 0x100000 && (__pfn_to_phys(pfn) & ~SECTION_MASK))
-		return NULL;
-
-	/*
-	 * Don't allow RAM to be mapped
-	 */
-	if (pfn_valid(pfn)) {
-		WARN(1, "BUG: Your driver calls ioremap() on\n"
-			"system memory.  This leads to architecturally\n"
-			"unpredictable behaviour, and ioremap() will fail in\n"
-			"the next kernel release. Please fix your driver.\n");
-		return NULL;
-	}
-
-	type = get_mem_type(mtype);
-	if (!type)
-		return NULL;
-
-	/*
-	 * Page align the mapping size, taking account of any offset.
-	 */
-	size = PAGE_ALIGN(offset + size);
-
-	area = get_vm_area_caller(size, VM_IOREMAP, caller);
-	if (!area)
-		return NULL;
-	addr = (unsigned long)area->addr;
-
-	if (!((__pfn_to_phys(pfn) | size | addr) & ~PMD_MASK)) {
-		area->flags |= VM_UNICORE_SECTION_MAPPING;
-		err = remap_area_sections(addr, pfn, size, type);
-	} else
-		err = ioremap_page_range(addr, addr + size, __pfn_to_phys(pfn),
-					 __pgprot(type->prot_pte));
-
-	if (err) {
-		vunmap((void *)addr);
-		return NULL;
-	}
-
-	flush_cache_vmap(addr, addr + size);
-	return (void __iomem *) (offset + addr);
-}
-
-void __iomem *__uc32_ioremap_caller(unsigned long phys_addr, size_t size,
-	unsigned int mtype, void *caller)
-{
-	unsigned long last_addr;
-	unsigned long offset = phys_addr & ~PAGE_MASK;
-	unsigned long pfn = __phys_to_pfn(phys_addr);
-
-	/*
-	 * Don't allow wraparound or zero size
-	 */
-	last_addr = phys_addr + size - 1;
-	if (!size || last_addr < phys_addr)
-		return NULL;
-
-	return __uc32_ioremap_pfn_caller(pfn, offset, size, mtype, caller);
-}
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void __iomem *
-__uc32_ioremap_pfn(unsigned long pfn, unsigned long offset, size_t size,
-		  unsigned int mtype)
-{
-	return __uc32_ioremap_pfn_caller(pfn, offset, size, mtype,
-			__builtin_return_address(0));
-}
-EXPORT_SYMBOL(__uc32_ioremap_pfn);
-
-void __iomem *
-__uc32_ioremap(unsigned long phys_addr, size_t size)
-{
-	return __uc32_ioremap_caller(phys_addr, size, MT_DEVICE,
-			__builtin_return_address(0));
-}
-EXPORT_SYMBOL(__uc32_ioremap);
-
-void __uc32_iounmap(volatile void __iomem *io_addr)
-{
-	void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr);
-	struct vm_struct *vm;
-
-	/*
-	 * If this is a section based mapping we need to handle it
-	 * specially as the VM subsystem does not know how to handle
-	 * such a beast. We need the lock here b/c we need to clear
-	 * all the mappings before the area can be reclaimed
-	 * by someone else.
-	 */
-	vm = find_vm_area(addr);
-	if (vm && (vm->flags & VM_IOREMAP) &&
-		(vm->flags & VM_UNICORE_SECTION_MAPPING))
-		unmap_area_sections((unsigned long)vm->addr, vm->size);
-
-	vunmap(addr);
-}
-EXPORT_SYMBOL(__uc32_iounmap);
diff --git a/arch/unicore32/mm/mm.h b/arch/unicore32/mm/mm.h
deleted file mode 100644
index f157f5d249ab..000000000000
--- a/arch/unicore32/mm/mm.h
+++ /dev/null
@@ -1,31 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/mm/mm.h
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <asm/hwdef-copro.h>
-
-/* the upper-most page table pointer */
-extern pmd_t *top_pmd;
-extern int sysctl_overcommit_memory;
-
-#define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
-
-struct mem_type {
-	unsigned int prot_pte;
-	unsigned int prot_l1;
-	unsigned int prot_sect;
-};
-
-const struct mem_type *get_mem_type(unsigned int type);
-
-extern void __flush_dcache_page(struct address_space *, struct page *);
-extern void hook_fault_code(int nr, int (*fn)
-		(unsigned long, unsigned int, struct pt_regs *),
-		int sig, int code, const char *name);
-
-void __init bootmem_init(void);
-void uc32_mm_memblock_reserve(void);
diff --git a/arch/unicore32/mm/mmu.c b/arch/unicore32/mm/mmu.c
deleted file mode 100644
index 183d5b056814..000000000000
--- a/arch/unicore32/mm/mmu.c
+++ /dev/null
@@ -1,513 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/mmu.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/mman.h>
-#include <linux/nodemask.h>
-#include <linux/memblock.h>
-#include <linux/fs.h>
-#include <linux/io.h>
-
-#include <asm/cputype.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <linux/sizes.h>
-#include <asm/tlb.h>
-#include <asm/memblock.h>
-
-#include <mach/map.h>
-
-#include "mm.h"
-
-/*
- * empty_zero_page is a special page that is used for
- * zero-initialized data and COW.
- */
-struct page *empty_zero_page;
-EXPORT_SYMBOL(empty_zero_page);
-
-/*
- * The pmd table for the upper-most set of pages.
- */
-pmd_t *top_pmd;
-
-pgprot_t pgprot_user;
-EXPORT_SYMBOL(pgprot_user);
-
-pgprot_t pgprot_kernel;
-EXPORT_SYMBOL(pgprot_kernel);
-
-static int __init noalign_setup(char *__unused)
-{
-	cr_alignment &= ~CR_A;
-	cr_no_alignment &= ~CR_A;
-	set_cr(cr_alignment);
-	return 1;
-}
-__setup("noalign", noalign_setup);
-
-void adjust_cr(unsigned long mask, unsigned long set)
-{
-	unsigned long flags;
-
-	mask &= ~CR_A;
-
-	set &= mask;
-
-	local_irq_save(flags);
-
-	cr_no_alignment = (cr_no_alignment & ~mask) | set;
-	cr_alignment = (cr_alignment & ~mask) | set;
-
-	set_cr((get_cr() & ~mask) | set);
-
-	local_irq_restore(flags);
-}
-
-struct map_desc {
-	unsigned long virtual;
-	unsigned long pfn;
-	unsigned long length;
-	unsigned int type;
-};
-
-#define PROT_PTE_DEVICE		(PTE_PRESENT | PTE_YOUNG |	\
-				PTE_DIRTY | PTE_READ | PTE_WRITE)
-#define PROT_SECT_DEVICE	(PMD_TYPE_SECT | PMD_PRESENT |	\
-				PMD_SECT_READ | PMD_SECT_WRITE)
-
-static struct mem_type mem_types[] = {
-	[MT_DEVICE] = {		  /* Strongly ordered */
-		.prot_pte	= PROT_PTE_DEVICE,
-		.prot_l1	= PMD_TYPE_TABLE | PMD_PRESENT,
-		.prot_sect	= PROT_SECT_DEVICE,
-	},
-	/*
-	 * MT_KUSER: pte for vecpage -- cacheable,
-	 *       and sect for unigfx mmap -- noncacheable
-	 */
-	[MT_KUSER] = {
-		.prot_pte  = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY |
-				PTE_CACHEABLE | PTE_READ | PTE_EXEC,
-		.prot_l1   = PMD_TYPE_TABLE | PMD_PRESENT,
-		.prot_sect = PROT_SECT_DEVICE,
-	},
-	[MT_HIGH_VECTORS] = {
-		.prot_pte  = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY |
-				PTE_CACHEABLE | PTE_READ | PTE_WRITE |
-				PTE_EXEC,
-		.prot_l1   = PMD_TYPE_TABLE | PMD_PRESENT,
-	},
-	[MT_MEMORY] = {
-		.prot_pte  = PTE_PRESENT | PTE_YOUNG | PTE_DIRTY |
-				PTE_WRITE | PTE_EXEC,
-		.prot_l1   = PMD_TYPE_TABLE | PMD_PRESENT,
-		.prot_sect = PMD_TYPE_SECT | PMD_PRESENT | PMD_SECT_CACHEABLE |
-				PMD_SECT_READ | PMD_SECT_WRITE | PMD_SECT_EXEC,
-	},
-	[MT_ROM] = {
-		.prot_sect = PMD_TYPE_SECT | PMD_PRESENT | PMD_SECT_CACHEABLE |
-				PMD_SECT_READ,
-	},
-};
-
-const struct mem_type *get_mem_type(unsigned int type)
-{
-	return type < ARRAY_SIZE(mem_types) ? &mem_types[type] : NULL;
-}
-EXPORT_SYMBOL(get_mem_type);
-
-/*
- * Adjust the PMD section entries according to the CPU in use.
- */
-static void __init build_mem_type_table(void)
-{
-	pgprot_user   = __pgprot(PTE_PRESENT | PTE_YOUNG | PTE_CACHEABLE);
-	pgprot_kernel = __pgprot(PTE_PRESENT | PTE_YOUNG |
-				 PTE_DIRTY | PTE_READ | PTE_WRITE |
-				 PTE_EXEC | PTE_CACHEABLE);
-}
-
-#define vectors_base()	(vectors_high() ? 0xffff0000 : 0)
-
-static pte_t * __init early_pte_alloc(pmd_t *pmd, unsigned long addr,
-		unsigned long prot)
-{
-	if (pmd_none(*pmd)) {
-		size_t size = PTRS_PER_PTE * sizeof(pte_t);
-		pte_t *pte = memblock_alloc(size, size);
-
-		if (!pte)
-			panic("%s: Failed to allocate %zu bytes align=%zx\n",
-			      __func__, size, size);
-
-		__pmd_populate(pmd, __pa(pte) | prot);
-	}
-	BUG_ON(pmd_bad(*pmd));
-	return pte_offset_kernel(pmd, addr);
-}
-
-static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
-				  unsigned long end, unsigned long pfn,
-				  const struct mem_type *type)
-{
-	pte_t *pte = early_pte_alloc(pmd, addr, type->prot_l1);
-	do {
-		set_pte(pte, pfn_pte(pfn, __pgprot(type->prot_pte)));
-		pfn++;
-	} while (pte++, addr += PAGE_SIZE, addr != end);
-}
-
-static void __init alloc_init_section(pgd_t *pgd, unsigned long addr,
-				      unsigned long end, unsigned long phys,
-				      const struct mem_type *type)
-{
-	pmd_t *pmd = pmd_offset((pud_t *)pgd, addr);
-
-	/*
-	 * Try a section mapping - end, addr and phys must all be aligned
-	 * to a section boundary.
-	 */
-	if (((addr | end | phys) & ~SECTION_MASK) == 0) {
-		pmd_t *p = pmd;
-
-		do {
-			set_pmd(pmd, __pmd(phys | type->prot_sect));
-			phys += SECTION_SIZE;
-		} while (pmd++, addr += SECTION_SIZE, addr != end);
-
-		flush_pmd_entry(p);
-	} else {
-		/*
-		 * No need to loop; pte's aren't interested in the
-		 * individual L1 entries.
-		 */
-		alloc_init_pte(pmd, addr, end, __phys_to_pfn(phys), type);
-	}
-}
-
-/*
- * Create the page directory entries and any necessary
- * page tables for the mapping specified by `md'.  We
- * are able to cope here with varying sizes and address
- * offsets, and we take full advantage of sections.
- */
-static void __init create_mapping(struct map_desc *md)
-{
-	unsigned long phys, addr, length, end;
-	const struct mem_type *type;
-	pgd_t *pgd;
-
-	if (md->virtual != vectors_base() && md->virtual < TASK_SIZE) {
-		printk(KERN_WARNING "BUG: not creating mapping for "
-		       "0x%08llx at 0x%08lx in user region\n",
-		       __pfn_to_phys((u64)md->pfn), md->virtual);
-		return;
-	}
-
-	if ((md->type == MT_DEVICE || md->type == MT_ROM) &&
-	    md->virtual >= PAGE_OFFSET && md->virtual < VMALLOC_END) {
-		printk(KERN_WARNING "BUG: mapping for 0x%08llx at 0x%08lx "
-		       "overlaps vmalloc space\n",
-		       __pfn_to_phys((u64)md->pfn), md->virtual);
-	}
-
-	type = &mem_types[md->type];
-
-	addr = md->virtual & PAGE_MASK;
-	phys = (unsigned long)__pfn_to_phys(md->pfn);
-	length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
-
-	if (type->prot_l1 == 0 && ((addr | phys | length) & ~SECTION_MASK)) {
-		printk(KERN_WARNING "BUG: map for 0x%08lx at 0x%08lx can not "
-		       "be mapped using pages, ignoring.\n",
-		       __pfn_to_phys(md->pfn), addr);
-		return;
-	}
-
-	pgd = pgd_offset_k(addr);
-	end = addr + length;
-	do {
-		unsigned long next = pgd_addr_end(addr, end);
-
-		alloc_init_section(pgd, addr, next, phys, type);
-
-		phys += next - addr;
-		addr = next;
-	} while (pgd++, addr != end);
-}
-
-static void * __initdata vmalloc_min = (void *)(VMALLOC_END - SZ_128M);
-
-/*
- * vmalloc=size forces the vmalloc area to be exactly 'size'
- * bytes. This can be used to increase (or decrease) the vmalloc
- * area - the default is 128m.
- */
-static int __init early_vmalloc(char *arg)
-{
-	unsigned long vmalloc_reserve = memparse(arg, NULL);
-
-	if (vmalloc_reserve < SZ_16M) {
-		vmalloc_reserve = SZ_16M;
-		printk(KERN_WARNING
-			"vmalloc area too small, limiting to %luMB\n",
-			vmalloc_reserve >> 20);
-	}
-
-	if (vmalloc_reserve > VMALLOC_END - (PAGE_OFFSET + SZ_32M)) {
-		vmalloc_reserve = VMALLOC_END - (PAGE_OFFSET + SZ_32M);
-		printk(KERN_WARNING
-			"vmalloc area is too big, limiting to %luMB\n",
-			vmalloc_reserve >> 20);
-	}
-
-	vmalloc_min = (void *)(VMALLOC_END - vmalloc_reserve);
-	return 0;
-}
-early_param("vmalloc", early_vmalloc);
-
-static phys_addr_t lowmem_limit __initdata = SZ_1G;
-
-static void __init sanity_check_meminfo(void)
-{
-	int i, j;
-
-	lowmem_limit = __pa(vmalloc_min - 1) + 1;
-	memblock_set_current_limit(lowmem_limit);
-
-	for (i = 0, j = 0; i < meminfo.nr_banks; i++) {
-		struct membank *bank = &meminfo.bank[j];
-		*bank = meminfo.bank[i];
-		j++;
-	}
-	meminfo.nr_banks = j;
-}
-
-static inline void prepare_page_table(void)
-{
-	unsigned long addr;
-	phys_addr_t end;
-
-	/*
-	 * Clear out all the mappings below the kernel image.
-	 */
-	for (addr = 0; addr < MODULES_VADDR; addr += PGDIR_SIZE)
-		pmd_clear(pmd_off_k(addr));
-
-	for ( ; addr < PAGE_OFFSET; addr += PGDIR_SIZE)
-		pmd_clear(pmd_off_k(addr));
-
-	/*
-	 * Find the end of the first block of lowmem.
-	 */
-	end = memblock.memory.regions[0].base + memblock.memory.regions[0].size;
-	if (end >= lowmem_limit)
-		end = lowmem_limit;
-
-	/*
-	 * Clear out all the kernel space mappings, except for the first
-	 * memory bank, up to the end of the vmalloc region.
-	 */
-	for (addr = __phys_to_virt(end);
-	     addr < VMALLOC_END; addr += PGDIR_SIZE)
-		pmd_clear(pmd_off_k(addr));
-}
-
-/*
- * Reserve the special regions of memory
- */
-void __init uc32_mm_memblock_reserve(void)
-{
-	/*
-	 * Reserve the page tables.  These are already in use,
-	 * and can only be in node 0.
-	 */
-	memblock_reserve(__pa(swapper_pg_dir), PTRS_PER_PGD * sizeof(pgd_t));
-}
-
-/*
- * Set up device the mappings.  Since we clear out the page tables for all
- * mappings above VMALLOC_END, we will remove any debug device mappings.
- * This means you have to be careful how you debug this function, or any
- * called function.  This means you can't use any function or debugging
- * method which may touch any device, otherwise the kernel _will_ crash.
- */
-static void __init devicemaps_init(void)
-{
-	struct map_desc map;
-	unsigned long addr;
-	void *vectors;
-
-	/*
-	 * Allocate the vector page early.
-	 */
-	vectors = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-	if (!vectors)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, PAGE_SIZE, PAGE_SIZE);
-
-	for (addr = VMALLOC_END; addr; addr += PGDIR_SIZE)
-		pmd_clear(pmd_off_k(addr));
-
-	/*
-	 * Create a mapping for the machine vectors at the high-vectors
-	 * location (0xffff0000).  If we aren't using high-vectors, also
-	 * create a mapping at the low-vectors virtual address.
-	 */
-	map.pfn = __phys_to_pfn(virt_to_phys(vectors));
-	map.virtual = VECTORS_BASE;
-	map.length = PAGE_SIZE;
-	map.type = MT_HIGH_VECTORS;
-	create_mapping(&map);
-
-	/*
-	 * Create a mapping for the kuser page at the special
-	 * location (0xbfff0000) to the same vectors location.
-	 */
-	map.pfn = __phys_to_pfn(virt_to_phys(vectors));
-	map.virtual = KUSER_VECPAGE_BASE;
-	map.length = PAGE_SIZE;
-	map.type = MT_KUSER;
-	create_mapping(&map);
-
-	/*
-	 * Finally flush the caches and tlb to ensure that we're in a
-	 * consistent state wrt the writebuffer.  This also ensures that
-	 * any write-allocated cache lines in the vector page are written
-	 * back.  After this point, we can start to touch devices again.
-	 */
-	local_flush_tlb_all();
-	flush_cache_all();
-}
-
-static void __init map_lowmem(void)
-{
-	struct memblock_region *reg;
-
-	/* Map all the lowmem memory banks. */
-	for_each_memblock(memory, reg) {
-		phys_addr_t start = reg->base;
-		phys_addr_t end = start + reg->size;
-		struct map_desc map;
-
-		if (end > lowmem_limit)
-			end = lowmem_limit;
-		if (start >= end)
-			break;
-
-		map.pfn = __phys_to_pfn(start);
-		map.virtual = __phys_to_virt(start);
-		map.length = end - start;
-		map.type = MT_MEMORY;
-
-		create_mapping(&map);
-	}
-}
-
-/*
- * paging_init() sets up the page tables, initialises the zone memory
- * maps, and sets up the zero page, bad page and bad page tables.
- */
-void __init paging_init(void)
-{
-	void *zero_page;
-
-	build_mem_type_table();
-	sanity_check_meminfo();
-	prepare_page_table();
-	map_lowmem();
-	devicemaps_init();
-
-	top_pmd = pmd_off_k(0xffff0000);
-
-	/* allocate the zero page. */
-	zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
-	if (!zero_page)
-		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
-		      __func__, PAGE_SIZE, PAGE_SIZE);
-
-	bootmem_init();
-
-	empty_zero_page = virt_to_page(zero_page);
-	__flush_dcache_page(NULL, empty_zero_page);
-}
-
-/*
- * In order to soft-boot, we need to insert a 1:1 mapping in place of
- * the user-mode pages.  This will then ensure that we have predictable
- * results when turning the mmu off
- */
-void setup_mm_for_reboot(void)
-{
-	unsigned long base_pmdval;
-	pgd_t *pgd;
-	int i;
-
-	/*
-	 * We need to access to user-mode page tables here. For kernel threads
-	 * we don't have any user-mode mappings so we use the context that we
-	 * "borrowed".
-	 */
-	pgd = current->active_mm->pgd;
-
-	base_pmdval = PMD_SECT_WRITE | PMD_SECT_READ | PMD_TYPE_SECT;
-
-	for (i = 0; i < FIRST_USER_PGD_NR + USER_PTRS_PER_PGD; i++, pgd++) {
-		unsigned long pmdval = (i << PGDIR_SHIFT) | base_pmdval;
-		pmd_t *pmd;
-
-		pmd = pmd_off(pgd, i << PGDIR_SHIFT);
-		set_pmd(pmd, __pmd(pmdval));
-		flush_pmd_entry(pmd);
-	}
-
-	local_flush_tlb_all();
-}
-
-/*
- * Take care of architecture specific things when placing a new PTE into
- * a page table, or changing an existing PTE.  Basically, there are two
- * things that we need to take care of:
- *
- *  1. If PG_dcache_clean is not set for the page, we need to ensure
- *     that any cache entries for the kernels virtual memory
- *     range are written back to the page.
- *  2. If we have multiple shared mappings of the same space in
- *     an object, we need to deal with the cache aliasing issues.
- *
- * Note that the pte lock will be held.
- */
-void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr,
-	pte_t *ptep)
-{
-	unsigned long pfn = pte_pfn(*ptep);
-	struct address_space *mapping;
-	struct page *page;
-
-	if (!pfn_valid(pfn))
-		return;
-
-	/*
-	 * The zero page is never written to, so never has any dirty
-	 * cache lines, and therefore never needs to be flushed.
-	 */
-	page = pfn_to_page(pfn);
-	if (page == ZERO_PAGE(0))
-		return;
-
-	mapping = page_mapping_file(page);
-	if (!test_and_set_bit(PG_dcache_clean, &page->flags))
-		__flush_dcache_page(mapping, page);
-	if (mapping)
-		if (vma->vm_flags & VM_EXEC)
-			__flush_icache_all();
-}
diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c
deleted file mode 100644
index f01c73e04836..000000000000
--- a/arch/unicore32/mm/pgd.c
+++ /dev/null
@@ -1,102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/pgd.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/mm.h>
-#include <linux/gfp.h>
-#include <linux/highmem.h>
-
-#include <asm/pgalloc.h>
-#include <asm/page.h>
-#include <asm/tlbflush.h>
-
-#include "mm.h"
-
-#define FIRST_KERNEL_PGD_NR	(FIRST_USER_PGD_NR + USER_PTRS_PER_PGD)
-
-/*
- * need to get a 4k page for level 1
- */
-pgd_t *get_pgd_slow(struct mm_struct *mm)
-{
-	pgd_t *new_pgd, *init_pgd;
-	pmd_t *new_pmd, *init_pmd;
-	pte_t *new_pte, *init_pte;
-
-	new_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, 0);
-	if (!new_pgd)
-		goto no_pgd;
-
-	memset(new_pgd, 0, FIRST_KERNEL_PGD_NR * sizeof(pgd_t));
-
-	/*
-	 * Copy over the kernel and IO PGD entries
-	 */
-	init_pgd = pgd_offset_k(0);
-	memcpy(new_pgd + FIRST_KERNEL_PGD_NR, init_pgd + FIRST_KERNEL_PGD_NR,
-		       (PTRS_PER_PGD - FIRST_KERNEL_PGD_NR) * sizeof(pgd_t));
-
-	clean_dcache_area(new_pgd, PTRS_PER_PGD * sizeof(pgd_t));
-
-	if (!vectors_high()) {
-		/*
-		 * On UniCore, first page must always be allocated since it
-		 * contains the machine vectors.
-		 */
-		new_pmd = pmd_alloc(mm, (pud_t *)new_pgd, 0);
-		if (!new_pmd)
-			goto no_pmd;
-
-		new_pte = pte_alloc_map(mm, new_pmd, 0);
-		if (!new_pte)
-			goto no_pte;
-
-		init_pmd = pmd_offset((pud_t *)init_pgd, 0);
-		init_pte = pte_offset_map(init_pmd, 0);
-		set_pte(new_pte, *init_pte);
-		pte_unmap(init_pte);
-		pte_unmap(new_pte);
-	}
-
-	return new_pgd;
-
-no_pte:
-	pmd_free(mm, new_pmd);
-	mm_dec_nr_pmds(mm);
-no_pmd:
-	free_pages((unsigned long)new_pgd, 0);
-no_pgd:
-	return NULL;
-}
-
-void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd)
-{
-	pmd_t *pmd;
-	pgtable_t pte;
-
-	if (!pgd)
-		return;
-
-	/* pgd is always present and good */
-	pmd = pmd_off(pgd, 0);
-	if (pmd_none(*pmd))
-		goto free;
-	if (pmd_bad(*pmd)) {
-		pmd_ERROR(*pmd);
-		pmd_clear(pmd);
-		goto free;
-	}
-
-	pte = pmd_pgtable(*pmd);
-	pmd_clear(pmd);
-	pte_free(mm, pte);
-	mm_dec_nr_ptes(mm);
-	pmd_free(mm, pmd);
-	mm_dec_nr_pmds(mm);
-free:
-	free_pages((unsigned long) pgd, 0);
-}
diff --git a/arch/unicore32/mm/proc-macros.S b/arch/unicore32/mm/proc-macros.S
deleted file mode 100644
index 3b0ae7d5bd80..000000000000
--- a/arch/unicore32/mm/proc-macros.S
+++ /dev/null
@@ -1,142 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/mm/proc-macros.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- *
- * We need constants.h for:
- *  VMA_VM_MM
- *  VMA_VM_FLAGS
- *  VM_EXEC
- */
-#include <generated/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/memory.h>
-
-/*
- * the cache line sizes of the I and D cache are the same
- */
-#define CACHE_LINESIZE	32
-
-/*
- * This is the maximum size of an area which will be invalidated
- * using the single invalidate entry instructions.  Anything larger
- * than this, and we go for the whole cache.
- *
- * This value should be chosen such that we choose the cheapest
- * alternative.
- */
-#ifdef CONFIG_CPU_UCV2
-#define MAX_AREA_SIZE	0x800		/* 64 cache line */
-#endif
-
-/*
- * vma_vm_mm - get mm pointer from vma pointer (vma->vm_mm)
- */
-	.macro	vma_vm_mm, rd, rn
-	ldw	\rd, [\rn+], #VMA_VM_MM
-	.endm
-
-/*
- * vma_vm_flags - get vma->vm_flags
- */
-	.macro	vma_vm_flags, rd, rn
-	ldw	\rd, [\rn+], #VMA_VM_FLAGS
-	.endm
-
-	.macro	tsk_mm, rd, rn
-	ldw	\rd, [\rn+], #TI_TASK
-	ldw	\rd, [\rd+], #TSK_ACTIVE_MM
-	.endm
-
-/*
- * act_mm - get current->active_mm
- */
-	.macro	act_mm, rd
-	andn	\rd, sp, #8128
-	andn	\rd, \rd, #63
-	ldw	\rd, [\rd+], #TI_TASK
-	ldw	\rd, [\rd+], #TSK_ACTIVE_MM
-	.endm
-
-/*
- * mmid - get context id from mm pointer (mm->context.id)
- */
-	.macro	mmid, rd, rn
-	ldw	\rd, [\rn+], #MM_CONTEXT_ID
-	.endm
-
-/*
- * mask_asid - mask the ASID from the context ID
- */
-	.macro	asid, rd, rn
-	and	\rd, \rn, #255
-	.endm
-
-	.macro	crval, clear, mmuset, ucset
-	.word	\clear
-	.word	\mmuset
-	.endm
-
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-/*
- * va2pa va, pa, tbl, msk, off, err
- *	This macro is used to translate virtual address to its physical address.
- *
- *	va: virtual address
- *	pa: physical address, result is stored in this register
- *	tbl, msk, off:	temp registers, will be destroyed
- *	err: jump to error label if the physical address not exist
- * NOTE: all regs must be different
- */
-	.macro	va2pa, va, pa, tbl, msk, off, err=990f
-	movc	\pa, p0.c2, #0
-	mov	\off, \va >> #22		@ off <- index of 1st page table
-	adr	\tbl, 910f			@ tbl <- table of 1st page table
-900:						@ ---- handle 1, 2 page table
-	add	\pa, \pa, #PAGE_OFFSET		@ pa <- virt addr of page table
-	ldw	\pa, [\pa+], \off << #2		@ pa <- the content of pt
-	cand.a	\pa, #4				@ test exist bit
-	beq	\err				@ if not exist
-	and	\off, \pa, #3			@ off <- the last 2 bits
-	add	\tbl, \tbl, \off << #3		@ cmove table pointer
-	ldw	\msk, [\tbl+], #0		@ get the mask
-	ldw	pc, [\tbl+], #4
-930:						@ ---- handle 2nd page table
-	and	\pa, \pa, \msk			@ pa <- phys addr of 2nd pt
-	mov	\off, \va << #10
-	cntlo	\tbl, \msk			@ use tbl as temp reg
-	mov	\off, \off >> \tbl
-	mov	\off, \off >> #2		@ off <- index of 2nd pt
-	adr	\tbl, 920f			@ tbl <- table of 2nd pt
-	b	900b
-910:						@ 1st level page table
-	.word	0xfffff000, 930b		@ second level page table
-	.word	0xfffffc00, 930b		@ second level large page table
-	.word	0x00000000, \err		@ invalid
-	.word	0xffc00000, 980f		@ super page
-
-920:						@ 2nd level page table
-	.word	0xfffff000, 980f		@ page
-	.word	0xffffc000, 980f		@ middle page
-	.word	0xffff0000, 980f		@ large page
-	.word	0x00000000, \err		@ invalid
-980:
-	andn	\tbl, \va, \msk
-	and	\pa, \pa, \msk
-	or	\pa, \pa, \tbl
-990:
-	.endm
-#endif
-
-	.macro dcacheline_flush, addr, t1, t2
-	mov	\t1, \addr << #20
-	ldw	\t2, =_stext			@ _stext must ALIGN(4096)
-	add	\t2, \t2, \t1 >> #20
-	ldw	\t1, [\t2+], #0x0000
-	ldw	\t1, [\t2+], #0x1000
-	ldw	\t1, [\t2+], #0x2000
-	ldw	\t1, [\t2+], #0x3000
-	.endm
diff --git a/arch/unicore32/mm/proc-syms.c b/arch/unicore32/mm/proc-syms.c
deleted file mode 100644
index 6c081616fc3c..000000000000
--- a/arch/unicore32/mm/proc-syms.c
+++ /dev/null
@@ -1,19 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/mm/proc-syms.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/module.h>
-#include <linux/mm.h>
-
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-#include <asm/page.h>
-
-EXPORT_SYMBOL(cpu_dcache_clean_area);
-EXPORT_SYMBOL(cpu_set_pte);
-
-EXPORT_SYMBOL(__cpuc_coherent_kern_range);
diff --git a/arch/unicore32/mm/proc-ucv2.S b/arch/unicore32/mm/proc-ucv2.S
deleted file mode 100644
index 18f8c4fb21a0..000000000000
--- a/arch/unicore32/mm/proc-ucv2.S
+++ /dev/null
@@ -1,131 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/mm/proc-ucv2.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <linux/pgtable.h>
-#include <asm/assembler.h>
-#include <asm/hwcap.h>
-#include <asm/pgtable-hwdef.h>
-
-#include "proc-macros.S"
-
-ENTRY(cpu_proc_fin)
-	stm.w	(lr), [sp-]
-	mov	ip, #PSR_R_BIT | PSR_I_BIT | PRIV_MODE
-	mov.a	asr, ip
-	b.l	__cpuc_flush_kern_all
-	ldm.w	(pc), [sp]+
-
-/*
- *	cpu_reset(loc)
- *
- *	Perform a soft reset of the system.  Put the CPU into the
- *	same state as it would be if it had been reset, and branch
- *	to what would be the reset vector.
- *
- *	- loc   - location to jump to for soft reset
- */
-	.align	5
-ENTRY(cpu_reset)
-	mov	ip, #0
-	movc	p0.c5, ip, #28			@ Cache invalidate all
-	nop8
-
-	movc	p0.c6, ip, #6			@ TLB invalidate all
-	nop8
-
-	movc	ip, p0.c1, #0			@ ctrl register
-	or	ip, ip, #0x2000			@ vector base address
-	andn	ip, ip, #0x000f			@ ............idam
-	movc	p0.c1, ip, #0			@ disable caches and mmu
-	nop
-	mov	pc, r0				@ jump to loc
-	nop8
-
-/*
- *	cpu_do_idle()
- *
- *	Idle the processor (eg, wait for interrupt).
- *
- *	IRQs are already disabled.
- */
-ENTRY(cpu_do_idle)
-	mov	r0, #0				@ PCI address
-	.rept	8
-	ldw	r1, [r0]
-	.endr
-	mov	pc, lr
-
-ENTRY(cpu_dcache_clean_area)
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	csub.a	r1, #MAX_AREA_SIZE
-	bsg	101f
-	mov	r9, #PAGE_SZ
-	sub	r9, r9, #1			@ PAGE_MASK
-1:	va2pa	r0, r10, r11, r12, r13		@ r10 is PA
-	b	3f
-2:	cand.a	r0, r9
-	beq	1b
-3:	movc	p0.c5, r10, #11			@ clean D entry
-	nop8
-	add	r0, r0, #CACHE_LINESIZE
-	add	r10, r10, #CACHE_LINESIZE
-	sub.a	r1, r1, #CACHE_LINESIZE
-	bua	2b
-	mov	pc, lr
-#endif
-101:	mov	ip, #0
-	movc	p0.c5, ip, #10			@ Dcache clean all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	cpu_do_switch_mm(pgd_phys)
- *
- *	Set the translation table base pointer to be pgd_phys
- *
- *	- pgd_phys - physical address of new pgd
- *
- *	It is assumed that:
- *	- we are not using split page tables
- */
-	.align	5
-ENTRY(cpu_do_switch_mm)
-	movc	p0.c2, r0, #0			@ update page table ptr
-	nop8
-
-	movc	p0.c6, ip, #6			@ TLB invalidate all
-	nop8
-
-	mov	pc, lr
-
-/*
- *	cpu_set_pte(ptep, pte)
- *
- *	Set a level 2 translation table entry.
- *
- *	- ptep  - pointer to level 2 translation table entry
- *	- pte   - PTE value to store
- */
-	.align	5
-ENTRY(cpu_set_pte)
-	stw	r1, [r0]
-#ifndef CONFIG_CPU_DCACHE_LINE_DISABLE
-	sub	r2, r0, #PAGE_OFFSET
-	movc	p0.c5, r2, #11				@ Dcache clean line
-	nop8
-#else
-	mov	ip, #0
-	movc	p0.c5, ip, #10				@ Dcache clean all
-	nop8
-	@dcacheline_flush	r0, r2, ip
-#endif
-	mov	pc, lr
-
diff --git a/arch/unicore32/mm/tlb-ucv2.S b/arch/unicore32/mm/tlb-ucv2.S
deleted file mode 100644
index 0ce9c6b6f1db..000000000000
--- a/arch/unicore32/mm/tlb-ucv2.S
+++ /dev/null
@@ -1,86 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * linux/arch/unicore32/mm/tlb-ucv2.S
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- * Copyright (C) 2001-2010 GUAN Xue-tao
- */
-#include <linux/init.h>
-#include <linux/linkage.h>
-#include <asm/assembler.h>
-#include <asm/page.h>
-#include <asm/tlbflush.h>
-#include "proc-macros.S"
-
-/*
- *	__cpu_flush_user_tlb_range(start, end, vma)
- *
- *	Invalidate a range of TLB entries in the specified address space.
- *
- *	- start - start address (may not be aligned)
- *	- end   - end address (exclusive, may not be aligned)
- *	- vma   - vma_struct describing address range
- */
-ENTRY(__cpu_flush_user_tlb_range)
-#ifndef	CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE
-	mov	r0, r0 >> #PAGE_SHIFT		@ align address
-	mov	r0, r0 << #PAGE_SHIFT
-	vma_vm_flags r2, r2			@ get vma->vm_flags
-1:
-	movc	p0.c6, r0, #3
-	nop8
-
-	cand.a	r2, #VM_EXEC			@ Executable area ?
-	beq	2f
-
-	movc	p0.c6, r0, #5
-	nop8
-2:
-	add	r0, r0, #PAGE_SZ
-	csub.a	r0, r1
-	beb	1b
-#else
-	movc	p0.c6, r0, #2
-	nop8
-
-	cand.a	r2, #VM_EXEC			@ Executable area ?
-	beq	2f
-
-	movc	p0.c6, r0, #4
-	nop8
-2:
-#endif
-	mov	pc, lr
-
-/*
- *	__cpu_flush_kern_tlb_range(start,end)
- *
- *	Invalidate a range of kernel TLB entries
- *
- *	- start - start address (may not be aligned)
- *	- end   - end address (exclusive, may not be aligned)
- */
-ENTRY(__cpu_flush_kern_tlb_range)
-#ifndef	CONFIG_CPU_TLB_SINGLE_ENTRY_DISABLE
-	mov	r0, r0 >> #PAGE_SHIFT		@ align address
-	mov	r0, r0 << #PAGE_SHIFT
-1:
-	movc	p0.c6, r0, #3
-	nop8
-
-	movc	p0.c6, r0, #5
-	nop8
-
-	add	r0, r0, #PAGE_SZ
-	csub.a	r0, r1
-	beb	1b
-#else
-	movc	p0.c6, r0, #2
-	nop8
-
-	movc	p0.c6, r0, #4
-	nop8
-#endif
-	mov	pc, lr
-
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 491f1347bf43..e7b78d5ae1ab 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -26,7 +26,7 @@ int C_A_D = 1;
 struct pid *cad_pid;
 EXPORT_SYMBOL(cad_pid);
 
-#if defined(CONFIG_ARM) || defined(CONFIG_UNICORE32)
+#if defined(CONFIG_ARM)
 #define DEFAULT_REBOOT_MODE		= REBOOT_HARD
 #else
 #define DEFAULT_REBOOT_MODE

From 5853d602dc581b891cc591ec5f67024bcaacdfcf Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 10 Jun 2020 09:57:50 +0300
Subject: [PATCH 230/502] cpufreq: remove unicore32 driver

The unicore32 port is removed from the kernel.
There is no point in keeping a stale cpufreq driver for this architecture.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/cpufreq/Makefile           |  1 -
 drivers/cpufreq/unicore2-cpufreq.c | 76 ------------------------------
 2 files changed, 77 deletions(-)
 delete mode 100644 drivers/cpufreq/unicore2-cpufreq.c

diff --git a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
index f6670c4abbb0..089938ead681 100644
--- a/drivers/cpufreq/Makefile
+++ b/drivers/cpufreq/Makefile
@@ -108,4 +108,3 @@ obj-$(CONFIG_LOONGSON1_CPUFREQ)		+= loongson1-cpufreq.o
 obj-$(CONFIG_SH_CPU_FREQ)		+= sh-cpufreq.o
 obj-$(CONFIG_SPARC_US2E_CPUFREQ)	+= sparc-us2e-cpufreq.o
 obj-$(CONFIG_SPARC_US3_CPUFREQ)		+= sparc-us3-cpufreq.o
-obj-$(CONFIG_UNICORE32)			+= unicore2-cpufreq.o
diff --git a/drivers/cpufreq/unicore2-cpufreq.c b/drivers/cpufreq/unicore2-cpufreq.c
deleted file mode 100644
index 98d392196df2..000000000000
--- a/drivers/cpufreq/unicore2-cpufreq.c
+++ /dev/null
@@ -1,76 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * clock scaling for the UniCore-II
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/err.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/clk.h>
-#include <linux/cpufreq.h>
-
-#include <mach/hardware.h>
-
-static struct cpufreq_driver ucv2_driver;
-
-/* make sure that only the "userspace" governor is run
- * -- anything else wouldn't make sense on this platform, anyway.
- */
-static int ucv2_verify_speed(struct cpufreq_policy_data *policy)
-{
-	if (policy->cpu)
-		return -EINVAL;
-
-	cpufreq_verify_within_cpu_limits(policy);
-	return 0;
-}
-
-static int ucv2_target(struct cpufreq_policy *policy,
-			 unsigned int target_freq,
-			 unsigned int relation)
-{
-	struct cpufreq_freqs freqs;
-	int ret;
-
-	freqs.old = policy->cur;
-	freqs.new = target_freq;
-
-	cpufreq_freq_transition_begin(policy, &freqs);
-	ret = clk_set_rate(policy->clk, target_freq * 1000);
-	cpufreq_freq_transition_end(policy, &freqs, ret);
-
-	return ret;
-}
-
-static int __init ucv2_cpu_init(struct cpufreq_policy *policy)
-{
-	if (policy->cpu != 0)
-		return -EINVAL;
-
-	policy->min = policy->cpuinfo.min_freq = 250000;
-	policy->max = policy->cpuinfo.max_freq = 1000000;
-	policy->clk = clk_get(NULL, "MAIN_CLK");
-	return PTR_ERR_OR_ZERO(policy->clk);
-}
-
-static struct cpufreq_driver ucv2_driver = {
-	.flags		= CPUFREQ_STICKY | CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING,
-	.verify		= ucv2_verify_speed,
-	.target		= ucv2_target,
-	.get		= cpufreq_generic_get,
-	.init		= ucv2_cpu_init,
-	.name		= "UniCore-II",
-};
-
-static int __init ucv2_cpufreq_init(void)
-{
-	return cpufreq_register_driver(&ucv2_driver);
-}
-
-arch_initcall(ucv2_cpufreq_init);

From c59e68250c4b317c99f1d1a1e8f990fd8e608afd Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 10 Jun 2020 09:57:50 +0300
Subject: [PATCH 231/502] i2c/buses: remove i2c-puv3  driver

The unicore32 port is removed from the kernel.
There is no point in keeping a stale i2c bus driver for this architecture.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 MAINTAINERS                   |   1 -
 drivers/i2c/busses/Kconfig    |  11 --
 drivers/i2c/busses/Makefile   |   1 -
 drivers/i2c/busses/i2c-puv3.c | 275 ----------------------------------
 4 files changed, 288 deletions(-)
 delete mode 100644 drivers/i2c/busses/i2c-puv3.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 1de95aa44bbb..ec65e063e258 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13585,7 +13585,6 @@ M:	Guan Xuetao <gxt@pku.edu.cn>
 S:	Maintained
 W:	http://mprc.pku.edu.cn/~guanxuetao/linux
 T:	git git://github.com/gxt/linux.git
-F:	drivers/i2c/busses/i2c-puv3.c
 F:	drivers/input/serio/i8042-unicore32io.h
 F:	drivers/rtc/rtc-puv3.c
 F:	drivers/video/fbdev/fb-puv3.c
diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
index 735bf31a3fdf..88639e52c73a 100644
--- a/drivers/i2c/busses/Kconfig
+++ b/drivers/i2c/busses/Kconfig
@@ -866,17 +866,6 @@ config I2C_PNX
 	  This driver can also be built as a module.  If so, the module
 	  will be called i2c-pnx.
 
-config I2C_PUV3
-	tristate "PKUnity v3 I2C bus support"
-	depends on UNICORE32 && ARCH_PUV3
-	select I2C_ALGOBIT
-	help
-	  This driver supports the I2C IP inside the PKUnity-v3 SoC.
-	  This I2C bus controller is under AMBA/AXI bus.
-
-	  This driver can also be built as a module.  If so, the module
-	  will be called i2c-puv3.
-
 config I2C_PXA
 	tristate "Intel PXA2XX I2C adapter"
 	depends on ARCH_PXA || ARCH_MMP || ARCH_MVEBU || (X86_32 && PCI && OF) || COMPILE_TEST
diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
index 306d5dc3f417..19aff0e45cb5 100644
--- a/drivers/i2c/busses/Makefile
+++ b/drivers/i2c/busses/Makefile
@@ -88,7 +88,6 @@ obj-$(CONFIG_I2C_PASEMI)	+= i2c-pasemi.o
 obj-$(CONFIG_I2C_PCA_PLATFORM)	+= i2c-pca-platform.o
 obj-$(CONFIG_I2C_PMCMSP)	+= i2c-pmcmsp.o
 obj-$(CONFIG_I2C_PNX)		+= i2c-pnx.o
-obj-$(CONFIG_I2C_PUV3)		+= i2c-puv3.o
 obj-$(CONFIG_I2C_PXA)		+= i2c-pxa.o
 obj-$(CONFIG_I2C_PXA_PCI)	+= i2c-pxa-pci.o
 obj-$(CONFIG_I2C_QCOM_CCI)	+= i2c-qcom-cci.o
diff --git a/drivers/i2c/busses/i2c-puv3.c b/drivers/i2c/busses/i2c-puv3.c
deleted file mode 100644
index 5cec5a36807d..000000000000
--- a/drivers/i2c/busses/i2c-puv3.c
+++ /dev/null
@@ -1,275 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * I2C driver for PKUnity-v3 SoC
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/err.h>
-#include <linux/slab.h>
-#include <linux/types.h>
-#include <linux/delay.h>
-#include <linux/i2c.h>
-#include <linux/clk.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-#include <mach/hardware.h>
-
-/*
- * Poll the i2c status register until the specified bit is set.
- * Returns 0 if timed out (100 msec).
- */
-static short poll_status(unsigned long bit)
-{
-	int loop_cntr = 1000;
-
-	if (bit & I2C_STATUS_TFNF) {
-		do {
-			udelay(10);
-		} while (!(readl(I2C_STATUS) & bit) && (--loop_cntr > 0));
-	} else {
-		/* RXRDY handler */
-		do {
-			if (readl(I2C_TAR) == I2C_TAR_EEPROM)
-				msleep(20);
-			else
-				udelay(10);
-		} while (!(readl(I2C_RXFLR) & 0xf) && (--loop_cntr > 0));
-	}
-
-	return (loop_cntr > 0);
-}
-
-static int xfer_read(struct i2c_adapter *adap, unsigned char *buf, int length)
-{
-	int i2c_reg = *buf;
-
-	/* Read data */
-	while (length--) {
-		if (!poll_status(I2C_STATUS_TFNF)) {
-			dev_dbg(&adap->dev, "Tx FIFO Not Full timeout\n");
-			return -ETIMEDOUT;
-		}
-
-		/* send addr */
-		writel(i2c_reg | I2C_DATACMD_WRITE, I2C_DATACMD);
-
-		/* get ready to next write */
-		i2c_reg++;
-
-		/* send read CMD */
-		writel(I2C_DATACMD_READ, I2C_DATACMD);
-
-		/* wait until the Rx FIFO have available */
-		if (!poll_status(I2C_STATUS_RFNE)) {
-			dev_dbg(&adap->dev, "RXRDY timeout\n");
-			return -ETIMEDOUT;
-		}
-
-		/* read the data to buf */
-		*buf = (readl(I2C_DATACMD) & I2C_DATACMD_DAT_MASK);
-		buf++;
-	}
-
-	return 0;
-}
-
-static int xfer_write(struct i2c_adapter *adap, unsigned char *buf, int length)
-{
-	int i2c_reg = *buf;
-
-	/* Do nothing but storing the reg_num to a static variable */
-	if (i2c_reg == -1) {
-		printk(KERN_WARNING "Error i2c reg\n");
-		return -ETIMEDOUT;
-	}
-
-	if (length == 1)
-		return 0;
-
-	buf++;
-	length--;
-	while (length--) {
-		/* send addr */
-		writel(i2c_reg | I2C_DATACMD_WRITE, I2C_DATACMD);
-
-		/* send write CMD */
-		writel(*buf | I2C_DATACMD_WRITE, I2C_DATACMD);
-
-		/* wait until the Rx FIFO have available */
-		msleep(20);
-
-		/* read the data to buf */
-		i2c_reg++;
-		buf++;
-	}
-
-	return 0;
-}
-
-/*
- * Generic i2c master transfer entrypoint.
- *
- */
-static int puv3_i2c_xfer(struct i2c_adapter *adap, struct i2c_msg *pmsg,
-		int num)
-{
-	int i, ret;
-	unsigned char swap;
-
-	/* Disable i2c */
-	writel(I2C_ENABLE_DISABLE, I2C_ENABLE);
-
-	/* Set the work mode and speed*/
-	writel(I2C_CON_MASTER | I2C_CON_SPEED_STD | I2C_CON_SLAVEDISABLE, I2C_CON);
-
-	writel(pmsg->addr, I2C_TAR);
-
-	/* Enable i2c */
-	writel(I2C_ENABLE_ENABLE, I2C_ENABLE);
-
-	dev_dbg(&adap->dev, "puv3_i2c_xfer: processing %d messages:\n", num);
-
-	for (i = 0; i < num; i++) {
-		dev_dbg(&adap->dev, " #%d: %sing %d byte%s %s 0x%02x\n", i,
-			pmsg->flags & I2C_M_RD ? "read" : "writ",
-			pmsg->len, pmsg->len > 1 ? "s" : "",
-			pmsg->flags & I2C_M_RD ? "from" : "to",	pmsg->addr);
-
-		if (pmsg->len && pmsg->buf) {	/* sanity check */
-			if (pmsg->flags & I2C_M_RD)
-				ret = xfer_read(adap, pmsg->buf, pmsg->len);
-			else
-				ret = xfer_write(adap, pmsg->buf, pmsg->len);
-
-			if (ret)
-				return ret;
-
-		}
-		dev_dbg(&adap->dev, "transfer complete\n");
-		pmsg++;		/* next message */
-	}
-
-	/* XXX: fixup be16_to_cpu in bq27x00_battery.c */
-	if (pmsg->addr == I2C_TAR_PWIC) {
-		swap = pmsg->buf[0];
-		pmsg->buf[0] = pmsg->buf[1];
-		pmsg->buf[1] = swap;
-	}
-
-	return i;
-}
-
-/*
- * Return list of supported functionality.
- */
-static u32 puv3_i2c_func(struct i2c_adapter *adapter)
-{
-	return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL;
-}
-
-static const struct i2c_algorithm puv3_i2c_algorithm = {
-	.master_xfer	= puv3_i2c_xfer,
-	.functionality	= puv3_i2c_func,
-};
-
-/*
- * Main initialization routine.
- */
-static int puv3_i2c_probe(struct platform_device *pdev)
-{
-	struct i2c_adapter *adapter;
-	struct resource *mem;
-	int rc;
-
-	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (!mem)
-		return -ENODEV;
-
-	if (!request_mem_region(mem->start, resource_size(mem), "puv3_i2c"))
-		return -EBUSY;
-
-	adapter = kzalloc(sizeof(struct i2c_adapter), GFP_KERNEL);
-	if (adapter == NULL) {
-		dev_err(&pdev->dev, "can't allocate interface!\n");
-		rc = -ENOMEM;
-		goto fail_nomem;
-	}
-	snprintf(adapter->name, sizeof(adapter->name), "PUV3-I2C at 0x%08x",
-			mem->start);
-	adapter->algo = &puv3_i2c_algorithm;
-	adapter->class = I2C_CLASS_HWMON;
-	adapter->dev.parent = &pdev->dev;
-
-	platform_set_drvdata(pdev, adapter);
-
-	adapter->nr = pdev->id;
-	rc = i2c_add_numbered_adapter(adapter);
-	if (rc)
-		goto fail_add_adapter;
-
-	dev_info(&pdev->dev, "PKUnity v3 i2c bus adapter.\n");
-	return 0;
-
-fail_add_adapter:
-	kfree(adapter);
-fail_nomem:
-	release_mem_region(mem->start, resource_size(mem));
-
-	return rc;
-}
-
-static int puv3_i2c_remove(struct platform_device *pdev)
-{
-	struct i2c_adapter *adapter = platform_get_drvdata(pdev);
-	struct resource *mem;
-
-	i2c_del_adapter(adapter);
-
-	put_device(&pdev->dev);
-
-	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	release_mem_region(mem->start, resource_size(mem));
-
-	return 0;
-}
-
-#ifdef CONFIG_PM_SLEEP
-static int puv3_i2c_suspend(struct device *dev)
-{
-	int poll_count;
-	/* Disable the IIC */
-	writel(I2C_ENABLE_DISABLE, I2C_ENABLE);
-	for (poll_count = 0; poll_count < 50; poll_count++) {
-		if (readl(I2C_ENSTATUS) & I2C_ENSTATUS_ENABLE)
-			udelay(25);
-	}
-
-	return 0;
-}
-
-static SIMPLE_DEV_PM_OPS(puv3_i2c_pm, puv3_i2c_suspend, NULL);
-#define PUV3_I2C_PM	(&puv3_i2c_pm)
-
-#else
-#define PUV3_I2C_PM	NULL
-#endif
-
-static struct platform_driver puv3_i2c_driver = {
-	.probe		= puv3_i2c_probe,
-	.remove		= puv3_i2c_remove,
-	.driver		= {
-		.name	= "PKUnity-v3-I2C",
-		.pm	= PUV3_I2C_PM,
-	}
-};
-
-module_platform_driver(puv3_i2c_driver);
-
-MODULE_DESCRIPTION("PKUnity v3 I2C driver");
-MODULE_LICENSE("GPL v2");
-MODULE_ALIAS("platform:puv3_i2c");

From a559063a6865357f5ae2c407a092a75ae9f1c84d Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 10 Jun 2020 09:57:50 +0300
Subject: [PATCH 232/502] input: i8042: remove support for 8042-unicore32io

The unicore32 port is removed from the kernel.
There is no point in keeping stale definitions to support this architecture.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 MAINTAINERS                             |  1 -
 drivers/input/serio/i8042-unicore32io.h | 70 -------------------------
 drivers/input/serio/i8042.h             |  2 -
 3 files changed, 73 deletions(-)
 delete mode 100644 drivers/input/serio/i8042-unicore32io.h

diff --git a/MAINTAINERS b/MAINTAINERS
index ec65e063e258..e5035fda296e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13585,7 +13585,6 @@ M:	Guan Xuetao <gxt@pku.edu.cn>
 S:	Maintained
 W:	http://mprc.pku.edu.cn/~guanxuetao/linux
 T:	git git://github.com/gxt/linux.git
-F:	drivers/input/serio/i8042-unicore32io.h
 F:	drivers/rtc/rtc-puv3.c
 F:	drivers/video/fbdev/fb-puv3.c
 
diff --git a/drivers/input/serio/i8042-unicore32io.h b/drivers/input/serio/i8042-unicore32io.h
deleted file mode 100644
index 50bb3ed94b56..000000000000
--- a/drivers/input/serio/i8042-unicore32io.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2011 Guan Xuetao
- */
-#ifndef _I8042_UNICORE32_H
-#define _I8042_UNICORE32_H
-
-#include <mach/hardware.h>
-
-/*
- * Names.
- */
-#define I8042_KBD_PHYS_DESC "isa0060/serio0"
-#define I8042_AUX_PHYS_DESC "isa0060/serio1"
-#define I8042_MUX_PHYS_DESC "isa0060/serio%d"
-
-/*
- * IRQs.
- */
-#define I8042_KBD_IRQ           IRQ_PS2_KBD
-#define I8042_AUX_IRQ           IRQ_PS2_AUX
-
-/*
- * Register numbers.
- */
-#define I8042_COMMAND_REG	PS2_COMMAND
-#define I8042_STATUS_REG	PS2_STATUS
-#define I8042_DATA_REG		PS2_DATA
-
-#define I8042_REGION_START	(resource_size_t)(PS2_DATA)
-#define I8042_REGION_SIZE	(resource_size_t)(16)
-
-static inline int i8042_read_data(void)
-{
-	return readb(I8042_DATA_REG);
-}
-
-static inline int i8042_read_status(void)
-{
-	return readb(I8042_STATUS_REG);
-}
-
-static inline void i8042_write_data(int val)
-{
-	writeb(val, I8042_DATA_REG);
-}
-
-static inline void i8042_write_command(int val)
-{
-	writeb(val, I8042_COMMAND_REG);
-}
-
-static inline int i8042_platform_init(void)
-{
-	if (!request_mem_region(I8042_REGION_START, I8042_REGION_SIZE, "i8042"))
-		return -EBUSY;
-
-	i8042_reset = I8042_RESET_ALWAYS;
-	return 0;
-}
-
-static inline void i8042_platform_exit(void)
-{
-	release_mem_region(I8042_REGION_START, I8042_REGION_SIZE);
-}
-
-#endif /* _I8042_UNICORE32_H */
diff --git a/drivers/input/serio/i8042.h b/drivers/input/serio/i8042.h
index eb376700dfff..55381783dc82 100644
--- a/drivers/input/serio/i8042.h
+++ b/drivers/input/serio/i8042.h
@@ -21,8 +21,6 @@
 #include "i8042-sparcio.h"
 #elif defined(CONFIG_X86) || defined(CONFIG_IA64)
 #include "i8042-x86ia64io.h"
-#elif defined(CONFIG_UNICORE32)
-#include "i8042-unicore32io.h"
 #else
 #include "i8042-io.h"
 #endif

From a2022e1cf368c5d8794b75a9b5eb5f078a9bdb76 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 10 Jun 2020 09:57:50 +0300
Subject: [PATCH 233/502] pwm: remove pwm-puv3 driver

The unicore32 port is removed from the kernel.
There is no point in keeping a stale PWM driver for this architecture.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 drivers/pwm/Kconfig    |   9 ---
 drivers/pwm/Makefile   |   1 -
 drivers/pwm/pwm-puv3.c | 150 -----------------------------------------
 3 files changed, 160 deletions(-)
 delete mode 100644 drivers/pwm/pwm-puv3.c

diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig
index cb8d739067d2..7dbcf6973d33 100644
--- a/drivers/pwm/Kconfig
+++ b/drivers/pwm/Kconfig
@@ -370,15 +370,6 @@ config PWM_PCA9685
 	  To compile this driver as a module, choose M here: the module
 	  will be called pwm-pca9685.
 
-config PWM_PUV3
-	tristate "PKUnity NetBook-0916 PWM support"
-	depends on ARCH_PUV3
-	help
-	  Generic PWM framework driver for PKUnity NetBook-0916.
-
-	  To compile this driver as a module, choose M here: the module
-	  will be called pwm-puv3.
-
 config PWM_PXA
 	tristate "PXA PWM support"
 	depends on ARCH_PXA || COMPILE_TEST
diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile
index a59c710e98c7..2c2ba0a03557 100644
--- a/drivers/pwm/Makefile
+++ b/drivers/pwm/Makefile
@@ -34,7 +34,6 @@ obj-$(CONFIG_PWM_MTK_DISP)	+= pwm-mtk-disp.o
 obj-$(CONFIG_PWM_MXS)		+= pwm-mxs.o
 obj-$(CONFIG_PWM_OMAP_DMTIMER)	+= pwm-omap-dmtimer.o
 obj-$(CONFIG_PWM_PCA9685)	+= pwm-pca9685.o
-obj-$(CONFIG_PWM_PUV3)		+= pwm-puv3.o
 obj-$(CONFIG_PWM_PXA)		+= pwm-pxa.o
 obj-$(CONFIG_PWM_RCAR)		+= pwm-rcar.o
 obj-$(CONFIG_PWM_RENESAS_TPU)	+= pwm-renesas-tpu.o
diff --git a/drivers/pwm/pwm-puv3.c b/drivers/pwm/pwm-puv3.c
deleted file mode 100644
index 9d0bd87a425e..000000000000
--- a/drivers/pwm/pwm-puv3.c
+++ /dev/null
@@ -1,150 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * linux/arch/unicore32/kernel/pwm.c
- *
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/platform_device.h>
-#include <linux/slab.h>
-#include <linux/err.h>
-#include <linux/clk.h>
-#include <linux/io.h>
-#include <linux/pwm.h>
-
-#include <asm/div64.h>
-#include <mach/hardware.h>
-
-struct puv3_pwm_chip {
-	struct pwm_chip chip;
-	void __iomem *base;
-	struct clk *clk;
-};
-
-static inline struct puv3_pwm_chip *to_puv3(struct pwm_chip *chip)
-{
-	return container_of(chip, struct puv3_pwm_chip, chip);
-}
-
-/*
- * period_ns = 10^9 * (PRESCALE + 1) * (PV + 1) / PWM_CLK_RATE
- * duty_ns   = 10^9 * (PRESCALE + 1) * DC / PWM_CLK_RATE
- */
-static int puv3_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
-			   int duty_ns, int period_ns)
-{
-	unsigned long period_cycles, prescale, pv, dc;
-	struct puv3_pwm_chip *puv3 = to_puv3(chip);
-	unsigned long long c;
-
-	c = clk_get_rate(puv3->clk);
-	c = c * period_ns;
-	do_div(c, 1000000000);
-	period_cycles = c;
-
-	if (period_cycles < 1)
-		period_cycles = 1;
-
-	prescale = (period_cycles - 1) / 1024;
-	pv = period_cycles / (prescale + 1) - 1;
-
-	if (prescale > 63)
-		return -EINVAL;
-
-	if (duty_ns == period_ns)
-		dc = OST_PWMDCCR_FDCYCLE;
-	else
-		dc = (pv + 1) * duty_ns / period_ns;
-
-	/*
-	 * NOTE: the clock to PWM has to be enabled first
-	 * before writing to the registers
-	 */
-	clk_prepare_enable(puv3->clk);
-
-	writel(prescale, puv3->base + OST_PWM_PWCR);
-	writel(pv - dc, puv3->base + OST_PWM_DCCR);
-	writel(pv, puv3->base + OST_PWM_PCR);
-
-	clk_disable_unprepare(puv3->clk);
-
-	return 0;
-}
-
-static int puv3_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
-{
-	struct puv3_pwm_chip *puv3 = to_puv3(chip);
-
-	return clk_prepare_enable(puv3->clk);
-}
-
-static void puv3_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
-{
-	struct puv3_pwm_chip *puv3 = to_puv3(chip);
-
-	clk_disable_unprepare(puv3->clk);
-}
-
-static const struct pwm_ops puv3_pwm_ops = {
-	.config = puv3_pwm_config,
-	.enable = puv3_pwm_enable,
-	.disable = puv3_pwm_disable,
-	.owner = THIS_MODULE,
-};
-
-static int pwm_probe(struct platform_device *pdev)
-{
-	struct puv3_pwm_chip *puv3;
-	struct resource *r;
-	int ret;
-
-	puv3 = devm_kzalloc(&pdev->dev, sizeof(*puv3), GFP_KERNEL);
-	if (!puv3)
-		return -ENOMEM;
-
-	puv3->clk = devm_clk_get(&pdev->dev, "OST_CLK");
-	if (IS_ERR(puv3->clk))
-		return PTR_ERR(puv3->clk);
-
-	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	puv3->base = devm_ioremap_resource(&pdev->dev, r);
-	if (IS_ERR(puv3->base))
-		return PTR_ERR(puv3->base);
-
-	puv3->chip.dev = &pdev->dev;
-	puv3->chip.ops = &puv3_pwm_ops;
-	puv3->chip.base = -1;
-	puv3->chip.npwm = 1;
-
-	ret = pwmchip_add(&puv3->chip);
-	if (ret < 0) {
-		dev_err(&pdev->dev, "pwmchip_add() failed: %d\n", ret);
-		return ret;
-	}
-
-	platform_set_drvdata(pdev, puv3);
-	return 0;
-}
-
-static int pwm_remove(struct platform_device *pdev)
-{
-	struct puv3_pwm_chip *puv3 = platform_get_drvdata(pdev);
-
-	return pwmchip_remove(&puv3->chip);
-}
-
-static struct platform_driver puv3_pwm_driver = {
-	.driver = {
-		.name = "PKUnity-v3-PWM",
-	},
-	.probe = pwm_probe,
-	.remove = pwm_remove,
-};
-module_platform_driver(puv3_pwm_driver);
-
-MODULE_LICENSE("GPL v2");

From e26e59190ecd0b09a8778bbdc8239d0db78903c9 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 10 Jun 2020 09:57:50 +0300
Subject: [PATCH 234/502] video: fbdev: remove fb-puv3 driver

The unicore32 port is removed from the kernel.
There is no point in keeping a stale fbdev driver for this architecture.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 MAINTAINERS                   |   1 -
 drivers/video/fbdev/Kconfig   |  11 -
 drivers/video/fbdev/Makefile  |   1 -
 drivers/video/fbdev/fb-puv3.c | 836 ----------------------------------
 4 files changed, 849 deletions(-)
 delete mode 100644 drivers/video/fbdev/fb-puv3.c

diff --git a/MAINTAINERS b/MAINTAINERS
index e5035fda296e..79d70acdf119 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13586,7 +13586,6 @@ S:	Maintained
 W:	http://mprc.pku.edu.cn/~guanxuetao/linux
 T:	git git://github.com/gxt/linux.git
 F:	drivers/rtc/rtc-puv3.c
-F:	drivers/video/fbdev/fb-puv3.c
 
 PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER
 M:	Tomasz Duszynski <tduszyns@gmail.com>
diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig
index 0f559aeaf469..32a2698914c3 100644
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -2198,17 +2198,6 @@ config FB_BROADSHEET
 	  and could also have been called by other names when coupled with
 	  a bridge adapter.
 
-config FB_PUV3_UNIGFX
-	tristate "PKUnity v3 Unigfx framebuffer support"
-	depends on FB && UNICORE32 && ARCH_PUV3
-	select FB_SYS_FILLRECT
-	select FB_SYS_COPYAREA
-	select FB_SYS_IMAGEBLIT
-	select FB_SYS_FOPS
-	help
-	  Choose this option if you want to use the Unigfx device as a
-	  framebuffer device. Without the support of PCI & AGP.
-
 config FB_HYPERV
 	tristate "Microsoft Hyper-V Synthetic Video support"
 	depends on FB && HYPERV
diff --git a/drivers/video/fbdev/Makefile b/drivers/video/fbdev/Makefile
index aa6352798cf4..a0705b99e643 100644
--- a/drivers/video/fbdev/Makefile
+++ b/drivers/video/fbdev/Makefile
@@ -116,7 +116,6 @@ obj-y                             += omap2/
 obj-$(CONFIG_XEN_FBDEV_FRONTEND)  += xen-fbfront.o
 obj-$(CONFIG_FB_CARMINE)          += carminefb.o
 obj-$(CONFIG_FB_MB862XX)	  += mb862xx/
-obj-$(CONFIG_FB_PUV3_UNIGFX)      += fb-puv3.o
 obj-$(CONFIG_FB_HYPERV)		  += hyperv_fb.o
 obj-$(CONFIG_FB_OPENCORES)	  += ocfb.o
 obj-$(CONFIG_FB_SM712)		  += sm712fb.o
diff --git a/drivers/video/fbdev/fb-puv3.c b/drivers/video/fbdev/fb-puv3.c
deleted file mode 100644
index 030e85c11a78..000000000000
--- a/drivers/video/fbdev/fb-puv3.c
+++ /dev/null
@@ -1,836 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Frame Buffer Driver for PKUnity-v3 Unigfx
- * Code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/platform_device.h>
-#include <linux/clk.h>
-#include <linux/fb.h>
-#include <linux/init.h>
-#include <linux/console.h>
-#include <linux/mm.h>
-
-#include <linux/sizes.h>
-#include <mach/hardware.h>
-
-/* Platform_data reserved for unifb registers. */
-#define UNIFB_REGS_NUM		10
-/* RAM reserved for the frame buffer. */
-#define UNIFB_MEMSIZE		(SZ_4M)		/* 4 MB for 1024*768*32b */
-
-/*
- * cause UNIGFX don not have EDID
- * all the modes are organized as follow
- */
-static const struct fb_videomode unifb_modes[] = {
-	/* 0 640x480-60 VESA */
-	{ "640x480@60",  60,  640, 480,  25175000,  48, 16, 34, 10,  96, 1,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-	/* 1 640x480-75 VESA */
-	{ "640x480@75",  75,  640, 480,  31500000, 120, 16, 18,  1,  64, 1,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-	/* 2 800x600-60 VESA */
-	{ "800x600@60",  60,  800, 600,  40000000,  88, 40, 26,  1, 128, 1,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-	/* 3 800x600-75 VESA */
-	{ "800x600@75",  75,  800, 600,  49500000, 160, 16, 23,  1,  80, 1,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-	/* 4 1024x768-60 VESA */
-	{ "1024x768@60", 60, 1024, 768,  65000000, 160, 24, 34,  3, 136, 1,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-	/* 5 1024x768-75 VESA */
-	{ "1024x768@75", 75, 1024, 768,  78750000, 176, 16, 30,  1,  96, 1,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-	/* 6 1280x960-60 VESA */
-	{ "1280x960@60", 60, 1280, 960, 108000000, 312, 96, 38,  1, 112, 1,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-	/* 7 1440x900-60 VESA */
-	{ "1440x900@60", 60, 1440, 900, 106500000, 232, 80, 30,  3, 152, 1,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-	/* 8 FIXME 9 1024x600-60 VESA UNTESTED */
-	{ "1024x600@60", 60, 1024, 600,  50650000, 160, 24, 26,  1, 136, 1,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-	/* 9 FIXME 10 1024x600-75 VESA UNTESTED */
-	{ "1024x600@75", 75, 1024, 600,  61500000, 176, 16, 23,  1,  96, 1,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-	/* 10 FIXME 11 1366x768-60 VESA UNTESTED */
-	{ "1366x768@60", 60, 1366, 768,  85500000, 256, 58, 18,  1,  112, 3,
-	  0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA },
-};
-
-static const struct fb_var_screeninfo unifb_default = {
-	.xres =		640,
-	.yres =		480,
-	.xres_virtual =	640,
-	.yres_virtual =	480,
-	.bits_per_pixel = 16,
-	.red =		{ 11, 5, 0 },
-	.green =	{ 5,  6, 0 },
-	.blue =		{ 0,  5, 0 },
-	.activate =	FB_ACTIVATE_NOW,
-	.height =	-1,
-	.width =	-1,
-	.pixclock =	25175000,
-	.left_margin =	48,
-	.right_margin =	16,
-	.upper_margin =	33,
-	.lower_margin =	10,
-	.hsync_len =	96,
-	.vsync_len =	2,
-	.vmode =	FB_VMODE_NONINTERLACED,
-};
-
-static struct fb_fix_screeninfo unifb_fix = {
-	.id =		"UNIGFX FB",
-	.type =		FB_TYPE_PACKED_PIXELS,
-	.visual =	FB_VISUAL_TRUECOLOR,
-	.xpanstep =	1,
-	.ypanstep =	1,
-	.ywrapstep =	1,
-	.accel =	FB_ACCEL_NONE,
-};
-
-static void unifb_sync(struct fb_info *info)
-{
-	/* TODO: may, this can be replaced by interrupt */
-	int cnt;
-
-	for (cnt = 0; cnt < 0x10000000; cnt++) {
-		if (readl(UGE_COMMAND) & 0x1000000)
-			return;
-	}
-
-	if (cnt > 0x8000000)
-		dev_warn(info->device, "Warning: UniGFX GE time out ...\n");
-}
-
-static void unifb_prim_fillrect(struct fb_info *info,
-				const struct fb_fillrect *region)
-{
-	int awidth = region->width;
-	int aheight = region->height;
-	int m_iBpp = info->var.bits_per_pixel;
-	int screen_width = info->var.xres;
-	int src_sel = 1;	/* from fg_color */
-	int pat_sel = 1;
-	int src_x0 = 0;
-	int dst_x0 = region->dx;
-	int src_y0 = 0;
-	int dst_y0 = region->dy;
-	int rop_alpha_sel = 0;
-	int rop_alpha_code = 0xCC;
-	int x_dir = 1;
-	int y_dir = 1;
-	int alpha_r = 0;
-	int alpha_sel = 0;
-	int dst_pitch = screen_width * (m_iBpp / 8);
-	int dst_offset = dst_y0 * dst_pitch + dst_x0 * (m_iBpp / 8);
-	int src_pitch = screen_width * (m_iBpp / 8);
-	int src_offset = src_y0 * src_pitch + src_x0 * (m_iBpp / 8);
-	unsigned int command = 0;
-	int clip_region = 0;
-	int clip_en = 0;
-	int tp_en = 0;
-	int fg_color = 0;
-	int bottom = info->var.yres - 1;
-	int right = info->var.xres - 1;
-	int top = 0;
-
-	bottom = (bottom << 16) | right;
-	command = (rop_alpha_sel << 26) | (pat_sel << 18) | (src_sel << 16)
-		| (x_dir << 20) | (y_dir << 21) | (command << 24)
-		| (clip_region << 23) | (clip_en << 22) | (tp_en << 27);
-	src_pitch = (dst_pitch << 16) | src_pitch;
-	awidth = awidth | (aheight << 16);
-	alpha_r = ((rop_alpha_code & 0xff) << 8) | (alpha_r & 0xff)
-		| (alpha_sel << 16);
-	src_x0 = (src_x0 & 0x1fff) | ((src_y0 & 0x1fff) << 16);
-	dst_x0 = (dst_x0 & 0x1fff) | ((dst_y0 & 0x1fff) << 16);
-	fg_color = region->color;
-
-	unifb_sync(info);
-
-	writel(((u32 *)(info->pseudo_palette))[fg_color], UGE_FCOLOR);
-	writel(0, UGE_BCOLOR);
-	writel(src_pitch, UGE_PITCH);
-	writel(src_offset, UGE_SRCSTART);
-	writel(dst_offset, UGE_DSTSTART);
-	writel(awidth, UGE_WIDHEIGHT);
-	writel(top, UGE_CLIP0);
-	writel(bottom, UGE_CLIP1);
-	writel(alpha_r, UGE_ROPALPHA);
-	writel(src_x0, UGE_SRCXY);
-	writel(dst_x0, UGE_DSTXY);
-	writel(command, UGE_COMMAND);
-}
-
-static void unifb_fillrect(struct fb_info *info,
-		const struct fb_fillrect *region)
-{
-	struct fb_fillrect modded;
-	int vxres, vyres;
-
-	if (info->flags & FBINFO_HWACCEL_DISABLED) {
-		sys_fillrect(info, region);
-		return;
-	}
-
-	vxres = info->var.xres_virtual;
-	vyres = info->var.yres_virtual;
-
-	memcpy(&modded, region, sizeof(struct fb_fillrect));
-
-	if (!modded.width || !modded.height ||
-	    modded.dx >= vxres || modded.dy >= vyres)
-		return;
-
-	if (modded.dx + modded.width > vxres)
-		modded.width = vxres - modded.dx;
-	if (modded.dy + modded.height > vyres)
-		modded.height = vyres - modded.dy;
-
-	unifb_prim_fillrect(info, &modded);
-}
-
-static void unifb_prim_copyarea(struct fb_info *info,
-				const struct fb_copyarea *area)
-{
-	int awidth = area->width;
-	int aheight = area->height;
-	int m_iBpp = info->var.bits_per_pixel;
-	int screen_width = info->var.xres;
-	int src_sel = 2;	/* from mem */
-	int pat_sel = 0;
-	int src_x0 = area->sx;
-	int dst_x0 = area->dx;
-	int src_y0 = area->sy;
-	int dst_y0 = area->dy;
-
-	int rop_alpha_sel = 0;
-	int rop_alpha_code = 0xCC;
-	int x_dir = 1;
-	int y_dir = 1;
-
-	int alpha_r = 0;
-	int alpha_sel = 0;
-	int dst_pitch = screen_width * (m_iBpp / 8);
-	int dst_offset = dst_y0 * dst_pitch + dst_x0 * (m_iBpp / 8);
-	int src_pitch = screen_width * (m_iBpp / 8);
-	int src_offset = src_y0 * src_pitch + src_x0 * (m_iBpp / 8);
-	unsigned int command = 0;
-	int clip_region = 0;
-	int clip_en = 1;
-	int tp_en = 0;
-	int top = 0;
-	int bottom = info->var.yres;
-	int right = info->var.xres;
-	int fg_color = 0;
-	int bg_color = 0;
-
-	if (src_x0 < 0)
-		src_x0 = 0;
-	if (src_y0 < 0)
-		src_y0 = 0;
-
-	if (src_y0 - dst_y0 > 0) {
-		y_dir = 1;
-	} else {
-		y_dir = 0;
-		src_offset = (src_y0 + aheight) * src_pitch +
-				src_x0 * (m_iBpp / 8);
-		dst_offset = (dst_y0 + aheight) * dst_pitch +
-				dst_x0 * (m_iBpp / 8);
-		src_y0 += aheight;
-		dst_y0 += aheight;
-	}
-
-	command = (rop_alpha_sel << 26) | (pat_sel << 18) | (src_sel << 16) |
-		(x_dir << 20) | (y_dir << 21) | (command << 24) |
-		(clip_region << 23) | (clip_en << 22) | (tp_en << 27);
-	src_pitch = (dst_pitch << 16) | src_pitch;
-	awidth = awidth | (aheight << 16);
-	alpha_r = ((rop_alpha_code & 0xff) << 8) | (alpha_r & 0xff) |
-		(alpha_sel << 16);
-	src_x0 = (src_x0 & 0x1fff) | ((src_y0 & 0x1fff) << 16);
-	dst_x0 = (dst_x0 & 0x1fff) | ((dst_y0 & 0x1fff) << 16);
-	bottom = (bottom << 16) | right;
-
-	unifb_sync(info);
-
-	writel(src_pitch, UGE_PITCH);
-	writel(src_offset, UGE_SRCSTART);
-	writel(dst_offset, UGE_DSTSTART);
-	writel(awidth, UGE_WIDHEIGHT);
-	writel(top, UGE_CLIP0);
-	writel(bottom, UGE_CLIP1);
-	writel(bg_color, UGE_BCOLOR);
-	writel(fg_color, UGE_FCOLOR);
-	writel(alpha_r, UGE_ROPALPHA);
-	writel(src_x0, UGE_SRCXY);
-	writel(dst_x0, UGE_DSTXY);
-	writel(command, UGE_COMMAND);
-}
-
-static void unifb_copyarea(struct fb_info *info, const struct fb_copyarea *area)
-{
-	struct fb_copyarea modded;
-	u32 vxres, vyres;
-	modded.sx = area->sx;
-	modded.sy = area->sy;
-	modded.dx = area->dx;
-	modded.dy = area->dy;
-	modded.width = area->width;
-	modded.height = area->height;
-
-	if (info->flags & FBINFO_HWACCEL_DISABLED) {
-		sys_copyarea(info, area);
-		return;
-	}
-
-	vxres = info->var.xres_virtual;
-	vyres = info->var.yres_virtual;
-
-	if (!modded.width || !modded.height ||
-	    modded.sx >= vxres || modded.sy >= vyres ||
-	    modded.dx >= vxres || modded.dy >= vyres)
-		return;
-
-	if (modded.sx + modded.width > vxres)
-		modded.width = vxres - modded.sx;
-	if (modded.dx + modded.width > vxres)
-		modded.width = vxres - modded.dx;
-	if (modded.sy + modded.height > vyres)
-		modded.height = vyres - modded.sy;
-	if (modded.dy + modded.height > vyres)
-		modded.height = vyres - modded.dy;
-
-	unifb_prim_copyarea(info, &modded);
-}
-
-static void unifb_imageblit(struct fb_info *info, const struct fb_image *image)
-{
-	sys_imageblit(info, image);
-}
-
-static u_long get_line_length(int xres_virtual, int bpp)
-{
-	u_long length;
-
-	length = xres_virtual * bpp;
-	length = (length + 31) & ~31;
-	length >>= 3;
-	return length;
-}
-
-/*
- *  Setting the video mode has been split into two parts.
- *  First part, xxxfb_check_var, must not write anything
- *  to hardware, it should only verify and adjust var.
- *  This means it doesn't alter par but it does use hardware
- *  data from it to check this var.
- */
-static int unifb_check_var(struct fb_var_screeninfo *var,
-			 struct fb_info *info)
-{
-	u_long line_length;
-
-	/*
-	 *  FB_VMODE_CONUPDATE and FB_VMODE_SMOOTH_XPAN are equal!
-	 *  as FB_VMODE_SMOOTH_XPAN is only used internally
-	 */
-
-	if (var->vmode & FB_VMODE_CONUPDATE) {
-		var->vmode |= FB_VMODE_YWRAP;
-		var->xoffset = info->var.xoffset;
-		var->yoffset = info->var.yoffset;
-	}
-
-	/*
-	 *  Some very basic checks
-	 */
-	if (!var->xres)
-		var->xres = 1;
-	if (!var->yres)
-		var->yres = 1;
-	if (var->xres > var->xres_virtual)
-		var->xres_virtual = var->xres;
-	if (var->yres > var->yres_virtual)
-		var->yres_virtual = var->yres;
-	if (var->bits_per_pixel <= 1)
-		var->bits_per_pixel = 1;
-	else if (var->bits_per_pixel <= 8)
-		var->bits_per_pixel = 8;
-	else if (var->bits_per_pixel <= 16)
-		var->bits_per_pixel = 16;
-	else if (var->bits_per_pixel <= 24)
-		var->bits_per_pixel = 24;
-	else if (var->bits_per_pixel <= 32)
-		var->bits_per_pixel = 32;
-	else
-		return -EINVAL;
-
-	if (var->xres_virtual < var->xoffset + var->xres)
-		var->xres_virtual = var->xoffset + var->xres;
-	if (var->yres_virtual < var->yoffset + var->yres)
-		var->yres_virtual = var->yoffset + var->yres;
-
-	/*
-	 *  Memory limit
-	 */
-	line_length =
-	    get_line_length(var->xres_virtual, var->bits_per_pixel);
-	if (line_length * var->yres_virtual > UNIFB_MEMSIZE)
-		return -ENOMEM;
-
-	/*
-	 * Now that we checked it we alter var. The reason being is that the
-	 * video mode passed in might not work but slight changes to it might
-	 * make it work. This way we let the user know what is acceptable.
-	 */
-	switch (var->bits_per_pixel) {
-	case 1:
-	case 8:
-		var->red.offset = 0;
-		var->red.length = 8;
-		var->green.offset = 0;
-		var->green.length = 8;
-		var->blue.offset = 0;
-		var->blue.length = 8;
-		var->transp.offset = 0;
-		var->transp.length = 0;
-		break;
-	case 16:		/* RGBA 5551 */
-		if (var->transp.length) {
-			var->red.offset = 0;
-			var->red.length = 5;
-			var->green.offset = 5;
-			var->green.length = 5;
-			var->blue.offset = 10;
-			var->blue.length = 5;
-			var->transp.offset = 15;
-			var->transp.length = 1;
-		} else {	/* RGB 565 */
-			var->red.offset = 11;
-			var->red.length = 5;
-			var->green.offset = 5;
-			var->green.length = 6;
-			var->blue.offset = 0;
-			var->blue.length = 5;
-			var->transp.offset = 0;
-			var->transp.length = 0;
-		}
-		break;
-	case 24:		/* RGB 888 */
-		var->red.offset = 0;
-		var->red.length = 8;
-		var->green.offset = 8;
-		var->green.length = 8;
-		var->blue.offset = 16;
-		var->blue.length = 8;
-		var->transp.offset = 0;
-		var->transp.length = 0;
-		break;
-	case 32:		/* RGBA 8888 */
-		var->red.offset = 16;
-		var->red.length = 8;
-		var->green.offset = 8;
-		var->green.length = 8;
-		var->blue.offset = 0;
-		var->blue.length = 8;
-		var->transp.offset = 24;
-		var->transp.length = 8;
-		break;
-	}
-	var->red.msb_right = 0;
-	var->green.msb_right = 0;
-	var->blue.msb_right = 0;
-	var->transp.msb_right = 0;
-
-	return 0;
-}
-
-/*
- * This routine actually sets the video mode. It's in here where we
- * the hardware state info->par and fix which can be affected by the
- * change in par. For this driver it doesn't do much.
- */
-static int unifb_set_par(struct fb_info *info)
-{
-	int hTotal, vTotal, hSyncStart, hSyncEnd, vSyncStart, vSyncEnd;
-	int format;
-
-#ifdef CONFIG_PUV3_PM
-	struct clk *clk_vga;
-	u32 pixclk = 0;
-	int i;
-
-	for (i = 0; i <= 10; i++) {
-		if    (info->var.xres         == unifb_modes[i].xres
-		    && info->var.yres         == unifb_modes[i].yres
-		    && info->var.upper_margin == unifb_modes[i].upper_margin
-		    && info->var.lower_margin == unifb_modes[i].lower_margin
-		    && info->var.left_margin  == unifb_modes[i].left_margin
-		    && info->var.right_margin == unifb_modes[i].right_margin
-		    && info->var.hsync_len    == unifb_modes[i].hsync_len
-		    && info->var.vsync_len    == unifb_modes[i].vsync_len) {
-			pixclk = unifb_modes[i].pixclock;
-			break;
-		}
-	}
-
-	/* set clock rate */
-	clk_vga = clk_get(info->device, "VGA_CLK");
-	if (clk_vga == ERR_PTR(-ENOENT))
-		return -ENOENT;
-
-	if (pixclk != 0) {
-		if (clk_set_rate(clk_vga, pixclk)) { /* set clock failed */
-			info->fix = unifb_fix;
-			info->var = unifb_default;
-			if (clk_set_rate(clk_vga, unifb_default.pixclock))
-				return -EINVAL;
-		}
-	}
-#endif
-
-	info->fix.line_length = get_line_length(info->var.xres_virtual,
-						info->var.bits_per_pixel);
-
-	hSyncStart = info->var.xres + info->var.right_margin;
-	hSyncEnd = hSyncStart + info->var.hsync_len;
-	hTotal = hSyncEnd + info->var.left_margin;
-
-	vSyncStart = info->var.yres + info->var.lower_margin;
-	vSyncEnd = vSyncStart + info->var.vsync_len;
-	vTotal = vSyncEnd + info->var.upper_margin;
-
-	switch (info->var.bits_per_pixel) {
-	case 8:
-		format = UDE_CFG_DST8;
-		break;
-	case 16:
-		format = UDE_CFG_DST16;
-		break;
-	case 24:
-		format = UDE_CFG_DST24;
-		break;
-	case 32:
-		format = UDE_CFG_DST32;
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	writel(info->fix.smem_start, UDE_FSA);
-	writel(info->var.yres, UDE_LS);
-	writel(get_line_length(info->var.xres,
-			info->var.bits_per_pixel) >> 3, UDE_PS);
-			/* >> 3 for hardware required. */
-	writel((hTotal << 16) | (info->var.xres), UDE_HAT);
-	writel(((hTotal - 1) << 16) | (info->var.xres - 1), UDE_HBT);
-	writel(((hSyncEnd - 1) << 16) | (hSyncStart - 1), UDE_HST);
-	writel((vTotal << 16) | (info->var.yres), UDE_VAT);
-	writel(((vTotal - 1) << 16) | (info->var.yres - 1), UDE_VBT);
-	writel(((vSyncEnd - 1) << 16) | (vSyncStart - 1), UDE_VST);
-	writel(UDE_CFG_GDEN_ENABLE | UDE_CFG_TIMEUP_ENABLE
-			| format | 0xC0000001, UDE_CFG);
-
-	return 0;
-}
-
-/*
- *  Set a single color register. The values supplied are already
- *  rounded down to the hardware's capabilities (according to the
- *  entries in the var structure). Return != 0 for invalid regno.
- */
-static int unifb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
-			 u_int transp, struct fb_info *info)
-{
-	if (regno >= 256)	/* no. of hw registers */
-		return 1;
-
-	/* grayscale works only partially under directcolor */
-	if (info->var.grayscale) {
-		/* grayscale = 0.30*R + 0.59*G + 0.11*B */
-		red = green = blue =
-		    (red * 77 + green * 151 + blue * 28) >> 8;
-	}
-
-#define CNVT_TOHW(val, width) ((((val)<<(width))+0x7FFF-(val))>>16)
-	switch (info->fix.visual) {
-	case FB_VISUAL_TRUECOLOR:
-	case FB_VISUAL_PSEUDOCOLOR:
-		red = CNVT_TOHW(red, info->var.red.length);
-		green = CNVT_TOHW(green, info->var.green.length);
-		blue = CNVT_TOHW(blue, info->var.blue.length);
-		transp = CNVT_TOHW(transp, info->var.transp.length);
-		break;
-	case FB_VISUAL_DIRECTCOLOR:
-		red = CNVT_TOHW(red, 8);	/* expect 8 bit DAC */
-		green = CNVT_TOHW(green, 8);
-		blue = CNVT_TOHW(blue, 8);
-		/* hey, there is bug in transp handling... */
-		transp = CNVT_TOHW(transp, 8);
-		break;
-	}
-#undef CNVT_TOHW
-	/* Truecolor has hardware independent palette */
-	if (info->fix.visual == FB_VISUAL_TRUECOLOR) {
-		u32 v;
-
-		if (regno >= 16)
-			return 1;
-
-		v = (red << info->var.red.offset) |
-		    (green << info->var.green.offset) |
-		    (blue << info->var.blue.offset) |
-		    (transp << info->var.transp.offset);
-		switch (info->var.bits_per_pixel) {
-		case 8:
-			break;
-		case 16:
-		case 24:
-		case 32:
-			((u32 *) (info->pseudo_palette))[regno] = v;
-			break;
-		default:
-			return 1;
-		}
-		return 0;
-	}
-	return 0;
-}
-
-/*
- *  Pan or Wrap the Display
- *
- *  This call looks only at xoffset, yoffset and the FB_VMODE_YWRAP flag
- */
-static int unifb_pan_display(struct fb_var_screeninfo *var,
-			   struct fb_info *info)
-{
-	if (var->vmode & FB_VMODE_YWRAP) {
-		if (var->yoffset < 0
-		    || var->yoffset >= info->var.yres_virtual
-		    || var->xoffset)
-			return -EINVAL;
-	} else {
-		if (var->xoffset + info->var.xres > info->var.xres_virtual ||
-		    var->yoffset + info->var.yres > info->var.yres_virtual)
-			return -EINVAL;
-	}
-	info->var.xoffset = var->xoffset;
-	info->var.yoffset = var->yoffset;
-	if (var->vmode & FB_VMODE_YWRAP)
-		info->var.vmode |= FB_VMODE_YWRAP;
-	else
-		info->var.vmode &= ~FB_VMODE_YWRAP;
-	return 0;
-}
-
-int unifb_mmap(struct fb_info *info,
-		    struct vm_area_struct *vma)
-{
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-	return vm_iomap_memory(vma, info->fix.smem_start, info->fix.smem_len);
-}
-
-static const struct fb_ops unifb_ops = {
-	.fb_read        = fb_sys_read,
-	.fb_write       = fb_sys_write,
-	.fb_check_var	= unifb_check_var,
-	.fb_set_par	= unifb_set_par,
-	.fb_setcolreg	= unifb_setcolreg,
-	.fb_pan_display	= unifb_pan_display,
-	.fb_fillrect	= unifb_fillrect,
-	.fb_copyarea	= unifb_copyarea,
-	.fb_imageblit   = unifb_imageblit,
-	.fb_mmap	= unifb_mmap,
-};
-
-/*
- *  Initialisation
- */
-static int unifb_probe(struct platform_device *dev)
-{
-	struct fb_info *info;
-	u32 unifb_regs[UNIFB_REGS_NUM];
-	int retval = -ENOMEM;
-	struct resource *iomem;
-	void *videomemory;
-
-	videomemory = (void *)__get_free_pages(GFP_KERNEL | __GFP_COMP,
-				get_order(UNIFB_MEMSIZE));
-	if (!videomemory)
-		goto err;
-
-	memset(videomemory, 0, UNIFB_MEMSIZE);
-
-	unifb_fix.smem_start = virt_to_phys(videomemory);
-	unifb_fix.smem_len = UNIFB_MEMSIZE;
-
-	iomem = platform_get_resource(dev, IORESOURCE_MEM, 0);
-	unifb_fix.mmio_start = iomem->start;
-
-	info = framebuffer_alloc(sizeof(u32)*256, &dev->dev);
-	if (!info)
-		goto err;
-
-	info->screen_base = (char __iomem *)videomemory;
-	info->fbops = &unifb_ops;
-
-	retval = fb_find_mode(&info->var, info, NULL,
-			      unifb_modes, 10, &unifb_modes[0], 16);
-
-	if (!retval || (retval == 4))
-		info->var = unifb_default;
-
-	info->fix = unifb_fix;
-	info->pseudo_palette = info->par;
-	info->par = NULL;
-	info->flags = FBINFO_FLAG_DEFAULT;
-#ifdef FB_ACCEL_PUV3_UNIGFX
-	info->fix.accel = FB_ACCEL_PUV3_UNIGFX;
-#endif
-
-	retval = fb_alloc_cmap(&info->cmap, 256, 0);
-	if (retval < 0)
-		goto err1;
-
-	retval = register_framebuffer(info);
-	if (retval < 0)
-		goto err2;
-	platform_set_drvdata(dev, info);
-	platform_device_add_data(dev, unifb_regs, sizeof(u32) * UNIFB_REGS_NUM);
-
-	fb_info(info, "Virtual frame buffer device, using %dM of video memory\n",
-		UNIFB_MEMSIZE >> 20);
-	return 0;
-err2:
-	fb_dealloc_cmap(&info->cmap);
-err1:
-	framebuffer_release(info);
-err:
-	return retval;
-}
-
-static int unifb_remove(struct platform_device *dev)
-{
-	struct fb_info *info = platform_get_drvdata(dev);
-
-	if (info) {
-		unregister_framebuffer(info);
-		fb_dealloc_cmap(&info->cmap);
-		framebuffer_release(info);
-	}
-	return 0;
-}
-
-#ifdef CONFIG_PM
-static int unifb_resume(struct platform_device *dev)
-{
-	int rc = 0;
-	u32 *unifb_regs = dev->dev.platform_data;
-
-	if (dev->dev.power.power_state.event == PM_EVENT_ON)
-		return 0;
-
-	console_lock();
-
-	if (dev->dev.power.power_state.event == PM_EVENT_SUSPEND) {
-		writel(unifb_regs[0], UDE_FSA);
-		writel(unifb_regs[1], UDE_LS);
-		writel(unifb_regs[2], UDE_PS);
-		writel(unifb_regs[3], UDE_HAT);
-		writel(unifb_regs[4], UDE_HBT);
-		writel(unifb_regs[5], UDE_HST);
-		writel(unifb_regs[6], UDE_VAT);
-		writel(unifb_regs[7], UDE_VBT);
-		writel(unifb_regs[8], UDE_VST);
-		writel(unifb_regs[9], UDE_CFG);
-	}
-	dev->dev.power.power_state = PMSG_ON;
-
-	console_unlock();
-
-	return rc;
-}
-
-static int unifb_suspend(struct platform_device *dev, pm_message_t mesg)
-{
-	u32 *unifb_regs = dev->dev.platform_data;
-
-	unifb_regs[0] = readl(UDE_FSA);
-	unifb_regs[1] = readl(UDE_LS);
-	unifb_regs[2] = readl(UDE_PS);
-	unifb_regs[3] = readl(UDE_HAT);
-	unifb_regs[4] = readl(UDE_HBT);
-	unifb_regs[5] = readl(UDE_HST);
-	unifb_regs[6] = readl(UDE_VAT);
-	unifb_regs[7] = readl(UDE_VBT);
-	unifb_regs[8] = readl(UDE_VST);
-	unifb_regs[9] = readl(UDE_CFG);
-
-	if (mesg.event == dev->dev.power.power_state.event)
-		return 0;
-
-	switch (mesg.event) {
-	case PM_EVENT_FREEZE:		/* about to take snapshot */
-	case PM_EVENT_PRETHAW:		/* before restoring snapshot */
-		goto done;
-	}
-
-	console_lock();
-
-	/* do nothing... */
-
-	console_unlock();
-
-done:
-	dev->dev.power.power_state = mesg;
-
-	return 0;
-}
-#else
-#define	unifb_resume	NULL
-#define unifb_suspend	NULL
-#endif
-
-static struct platform_driver unifb_driver = {
-	.probe	 = unifb_probe,
-	.remove  = unifb_remove,
-	.resume  = unifb_resume,
-	.suspend = unifb_suspend,
-	.driver  = {
-		.name	= "PKUnity-v3-UNIGFX",
-	},
-};
-
-static int __init unifb_init(void)
-{
-#ifndef MODULE
-	if (fb_get_options("unifb", NULL))
-		return -ENODEV;
-#endif
-
-	return platform_driver_register(&unifb_driver);
-}
-
-module_init(unifb_init);
-
-static void __exit unifb_exit(void)
-{
-	platform_driver_unregister(&unifb_driver);
-}
-
-module_exit(unifb_exit);
-
-MODULE_LICENSE("GPL v2");

From fa4b9519f074646252f6aeb33d9329a384439632 Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 10 Jun 2020 09:57:50 +0300
Subject: [PATCH 235/502] rtc: remove rtc-puv3 driver

The unicore32 port is removed from the kernel.
There is no point in keeping a stale RTC driver for this architecture.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 MAINTAINERS            |   1 -
 drivers/rtc/Kconfig    |   9 --
 drivers/rtc/Makefile   |   1 -
 drivers/rtc/rtc-puv3.c | 286 -----------------------------------------
 4 files changed, 297 deletions(-)
 delete mode 100644 drivers/rtc/rtc-puv3.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 79d70acdf119..e4787ac42153 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13585,7 +13585,6 @@ M:	Guan Xuetao <gxt@pku.edu.cn>
 S:	Maintained
 W:	http://mprc.pku.edu.cn/~guanxuetao/linux
 T:	git git://github.com/gxt/linux.git
-F:	drivers/rtc/rtc-puv3.c
 
 PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER
 M:	Tomasz Duszynski <tduszyns@gmail.com>
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index b54d87d45c89..f3b8e6dcd879 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -1729,15 +1729,6 @@ config RTC_DRV_TEGRA
 	  This drive can also be built as a module. If so, the module
 	  will be called rtc-tegra.
 
-config RTC_DRV_PUV3
-	tristate "PKUnity v3 RTC support"
-	depends on ARCH_PUV3
-	help
-	  This enables support for the RTC in the PKUnity-v3 SoCs.
-
-	  This drive can also be built as a module. If so, the module
-	  will be called rtc-puv3.
-
 config RTC_DRV_LOONGSON1
 	tristate "loongson1 RTC support"
 	depends on MACH_LOONGSON32
diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile
index 0721752c6ed4..880e08a409c3 100644
--- a/drivers/rtc/Makefile
+++ b/drivers/rtc/Makefile
@@ -128,7 +128,6 @@ obj-$(CONFIG_RTC_DRV_PL030)	+= rtc-pl030.o
 obj-$(CONFIG_RTC_DRV_PL031)	+= rtc-pl031.o
 obj-$(CONFIG_RTC_DRV_PM8XXX)	+= rtc-pm8xxx.o
 obj-$(CONFIG_RTC_DRV_PS3)	+= rtc-ps3.o
-obj-$(CONFIG_RTC_DRV_PUV3)	+= rtc-puv3.o
 obj-$(CONFIG_RTC_DRV_PXA)	+= rtc-pxa.o
 obj-$(CONFIG_RTC_DRV_R7301)	+= rtc-r7301.o
 obj-$(CONFIG_RTC_DRV_R9701)	+= rtc-r9701.o
diff --git a/drivers/rtc/rtc-puv3.c b/drivers/rtc/rtc-puv3.c
deleted file mode 100644
index 954b88d2485f..000000000000
--- a/drivers/rtc/rtc-puv3.c
+++ /dev/null
@@ -1,286 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-/*
- * RTC driver code specific to PKUnity SoC and UniCore ISA
- *
- *	Maintained by GUAN Xue-tao <gxt@mprc.pku.edu.cn>
- *	Copyright (C) 2001-2010 Guan Xuetao
- */
-
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/platform_device.h>
-#include <linux/interrupt.h>
-#include <linux/rtc.h>
-#include <linux/bcd.h>
-#include <linux/clk.h>
-#include <linux/log2.h>
-#include <linux/slab.h>
-#include <linux/uaccess.h>
-#include <linux/io.h>
-
-#include <asm/irq.h>
-#include <mach/hardware.h>
-
-static struct resource *puv3_rtc_mem;
-
-static int puv3_rtc_alarmno = IRQ_RTCAlarm;
-static int puv3_rtc_tickno  = IRQ_RTC;
-
-static DEFINE_SPINLOCK(puv3_rtc_pie_lock);
-
-/* IRQ Handlers */
-static irqreturn_t puv3_rtc_alarmirq(int irq, void *id)
-{
-	struct rtc_device *rdev = id;
-
-	writel(readl(RTC_RTSR) | RTC_RTSR_AL, RTC_RTSR);
-	rtc_update_irq(rdev, 1, RTC_AF | RTC_IRQF);
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t puv3_rtc_tickirq(int irq, void *id)
-{
-	struct rtc_device *rdev = id;
-
-	writel(readl(RTC_RTSR) | RTC_RTSR_HZ, RTC_RTSR);
-	rtc_update_irq(rdev, 1, RTC_PF | RTC_IRQF);
-	return IRQ_HANDLED;
-}
-
-/* Update control registers */
-static void puv3_rtc_setaie(struct device *dev, int to)
-{
-	unsigned int tmp;
-
-	dev_dbg(dev, "%s: aie=%d\n", __func__, to);
-
-	tmp = readl(RTC_RTSR) & ~RTC_RTSR_ALE;
-
-	if (to)
-		tmp |= RTC_RTSR_ALE;
-
-	writel(tmp, RTC_RTSR);
-}
-
-static int puv3_rtc_setpie(struct device *dev, int enabled)
-{
-	unsigned int tmp;
-
-	dev_dbg(dev, "%s: pie=%d\n", __func__, enabled);
-
-	spin_lock_irq(&puv3_rtc_pie_lock);
-	tmp = readl(RTC_RTSR) & ~RTC_RTSR_HZE;
-
-	if (enabled)
-		tmp |= RTC_RTSR_HZE;
-
-	writel(tmp, RTC_RTSR);
-	spin_unlock_irq(&puv3_rtc_pie_lock);
-
-	return 0;
-}
-
-/* Time read/write */
-static int puv3_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm)
-{
-	rtc_time64_to_tm(readl(RTC_RCNR), rtc_tm);
-
-	dev_dbg(dev, "read time %ptRr\n", rtc_tm);
-
-	return 0;
-}
-
-static int puv3_rtc_settime(struct device *dev, struct rtc_time *tm)
-{
-	dev_dbg(dev, "set time %ptRr\n", tm);
-
-	writel(rtc_tm_to_time64(tm), RTC_RCNR);
-
-	return 0;
-}
-
-static int puv3_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm)
-{
-	struct rtc_time *alm_tm = &alrm->time;
-
-	rtc_time64_to_tm(readl(RTC_RTAR), alm_tm);
-
-	alrm->enabled = readl(RTC_RTSR) & RTC_RTSR_ALE;
-
-	dev_dbg(dev, "read alarm: %d, %ptRr\n", alrm->enabled, alm_tm);
-
-	return 0;
-}
-
-static int puv3_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm)
-{
-	struct rtc_time *tm = &alrm->time;
-
-	dev_dbg(dev, "set alarm: %d, %ptRr\n", alrm->enabled, tm);
-
-	writel(rtc_tm_to_time64(tm), RTC_RTAR);
-
-	puv3_rtc_setaie(dev, alrm->enabled);
-
-	if (alrm->enabled)
-		enable_irq_wake(puv3_rtc_alarmno);
-	else
-		disable_irq_wake(puv3_rtc_alarmno);
-
-	return 0;
-}
-
-static int puv3_rtc_proc(struct device *dev, struct seq_file *seq)
-{
-	seq_printf(seq, "periodic_IRQ\t: %s\n",
-		     (readl(RTC_RTSR) & RTC_RTSR_HZE) ? "yes" : "no");
-	return 0;
-}
-
-static const struct rtc_class_ops puv3_rtcops = {
-	.read_time	= puv3_rtc_gettime,
-	.set_time	= puv3_rtc_settime,
-	.read_alarm	= puv3_rtc_getalarm,
-	.set_alarm	= puv3_rtc_setalarm,
-	.proc	        = puv3_rtc_proc,
-};
-
-static void puv3_rtc_enable(struct device *dev, int en)
-{
-	if (!en) {
-		writel(readl(RTC_RTSR) & ~RTC_RTSR_HZE, RTC_RTSR);
-	} else {
-		/* re-enable the device, and check it is ok */
-		if ((readl(RTC_RTSR) & RTC_RTSR_HZE) == 0) {
-			dev_info(dev, "rtc disabled, re-enabling\n");
-			writel(readl(RTC_RTSR) | RTC_RTSR_HZE, RTC_RTSR);
-		}
-	}
-}
-
-static int puv3_rtc_remove(struct platform_device *dev)
-{
-	puv3_rtc_setpie(&dev->dev, 0);
-	puv3_rtc_setaie(&dev->dev, 0);
-
-	release_resource(puv3_rtc_mem);
-	kfree(puv3_rtc_mem);
-
-	return 0;
-}
-
-static int puv3_rtc_probe(struct platform_device *pdev)
-{
-	struct rtc_device *rtc;
-	struct resource *res;
-	int ret;
-
-	dev_dbg(&pdev->dev, "%s: probe=%p\n", __func__, pdev);
-
-	/* find the IRQs */
-	puv3_rtc_tickno = platform_get_irq(pdev, 1);
-	if (puv3_rtc_tickno < 0)
-		return -ENOENT;
-
-	puv3_rtc_alarmno = platform_get_irq(pdev, 0);
-	if (puv3_rtc_alarmno < 0)
-		return -ENOENT;
-
-	dev_dbg(&pdev->dev, "PKUnity_rtc: tick irq %d, alarm irq %d\n",
-		 puv3_rtc_tickno, puv3_rtc_alarmno);
-
-	rtc = devm_rtc_allocate_device(&pdev->dev);
-	if (IS_ERR(rtc))
-		return PTR_ERR(rtc);
-
-	ret = devm_request_irq(&pdev->dev, puv3_rtc_alarmno, puv3_rtc_alarmirq,
-			       0, "pkunity-rtc alarm", rtc);
-	if (ret) {
-		dev_err(&pdev->dev, "IRQ%d error %d\n", puv3_rtc_alarmno, ret);
-		return ret;
-	}
-
-	ret = devm_request_irq(&pdev->dev, puv3_rtc_tickno, puv3_rtc_tickirq,
-			       0, "pkunity-rtc tick", rtc);
-	if (ret) {
-		dev_err(&pdev->dev, "IRQ%d error %d\n", puv3_rtc_tickno, ret);
-		return ret;
-	}
-
-	/* get the memory region */
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	if (res == NULL) {
-		dev_err(&pdev->dev, "failed to get memory region resource\n");
-		return -ENOENT;
-	}
-
-	puv3_rtc_mem = request_mem_region(res->start, resource_size(res),
-					  pdev->name);
-
-	if (puv3_rtc_mem == NULL) {
-		dev_err(&pdev->dev, "failed to reserve memory region\n");
-		ret = -ENOENT;
-		goto err_nores;
-	}
-
-	puv3_rtc_enable(&pdev->dev, 1);
-
-	/* register RTC and exit */
-	rtc->ops = &puv3_rtcops;
-	rtc->range_max = U32_MAX;
-	ret = rtc_register_device(rtc);
-	if (ret)
-		goto err_nortc;
-
-	/* platform setup code should have handled this; sigh */
-	if (!device_can_wakeup(&pdev->dev))
-		device_init_wakeup(&pdev->dev, 1);
-
-	platform_set_drvdata(pdev, rtc);
-	return 0;
-
- err_nortc:
-	puv3_rtc_enable(&pdev->dev, 0);
-	release_resource(puv3_rtc_mem);
-
- err_nores:
-	return ret;
-}
-
-#ifdef CONFIG_PM_SLEEP
-static int ticnt_save;
-
-static int puv3_rtc_suspend(struct device *dev)
-{
-	/* save RTAR for anyone using periodic interrupts */
-	ticnt_save = readl(RTC_RTAR);
-	puv3_rtc_enable(dev, 0);
-	return 0;
-}
-
-static int puv3_rtc_resume(struct device *dev)
-{
-	puv3_rtc_enable(dev, 1);
-	writel(ticnt_save, RTC_RTAR);
-	return 0;
-}
-#endif
-
-static SIMPLE_DEV_PM_OPS(puv3_rtc_pm_ops, puv3_rtc_suspend, puv3_rtc_resume);
-
-static struct platform_driver puv3_rtc_driver = {
-	.probe		= puv3_rtc_probe,
-	.remove		= puv3_rtc_remove,
-	.driver		= {
-		.name	= "PKUnity-v3-RTC",
-		.pm	= &puv3_rtc_pm_ops,
-	}
-};
-
-module_platform_driver(puv3_rtc_driver);
-
-MODULE_DESCRIPTION("RTC Driver for the PKUnity v3 chip");
-MODULE_AUTHOR("Hu Dongliang");
-MODULE_LICENSE("GPL v2");

From 3346dd99fb4cd174fdbfb68dc62cd109e4323f0f Mon Sep 17 00:00:00 2001
From: Mike Rapoport <rppt@linux.ibm.com>
Date: Wed, 10 Jun 2020 10:24:55 +0300
Subject: [PATCH 236/502] MAINTAINERS: remove "PKUNITY SOC DRIVERS" entry

There are no PKUnity drivers left, so remove the MAINTAINERS entry.

Signed-off-by: Mike Rapoport <rppt@linux.ibm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Acked-by: Guenter Roeck <linux@roeck-us.net>
---
 MAINTAINERS | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index e4787ac42153..6f8c204cb60e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -13580,12 +13580,6 @@ F:	drivers/block/pktcdvd.c
 F:	include/linux/pktcdvd.h
 F:	include/uapi/linux/pktcdvd.h
 
-PKUNITY SOC DRIVERS
-M:	Guan Xuetao <gxt@pku.edu.cn>
-S:	Maintained
-W:	http://mprc.pku.edu.cn/~guanxuetao/linux
-T:	git git://github.com/gxt/linux.git
-
 PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER
 M:	Tomasz Duszynski <tduszyns@gmail.com>
 S:	Maintained

From 66a049b764a71dc32031b7b533f98fc0299e6e11 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Thu, 25 Jun 2020 21:53:17 +0200
Subject: [PATCH 237/502] s390/stp: allow group and users to read stp sysfs
 files

There are no secrets in these files, so allow all users
to read them.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Acked-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/kernel/time.c | 49 ++++++++++++++++++-----------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 6bc20861fff9..700127ba689d 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -683,7 +683,7 @@ static struct bus_type stp_subsys = {
 	.dev_name	= "stp",
 };
 
-static ssize_t stp_ctn_id_show(struct device *dev,
+static ssize_t ctn_id_show(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
 {
@@ -693,9 +693,9 @@ static ssize_t stp_ctn_id_show(struct device *dev,
 		       *(unsigned long long *) stp_info.ctnid);
 }
 
-static DEVICE_ATTR(ctn_id, 0400, stp_ctn_id_show, NULL);
+static DEVICE_ATTR_RO(ctn_id);
 
-static ssize_t stp_ctn_type_show(struct device *dev,
+static ssize_t ctn_type_show(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
 {
@@ -704,9 +704,9 @@ static ssize_t stp_ctn_type_show(struct device *dev,
 	return sprintf(buf, "%i\n", stp_info.ctn);
 }
 
-static DEVICE_ATTR(ctn_type, 0400, stp_ctn_type_show, NULL);
+static DEVICE_ATTR_RO(ctn_type);
 
-static ssize_t stp_dst_offset_show(struct device *dev,
+static ssize_t dst_offset_show(struct device *dev,
 				   struct device_attribute *attr,
 				   char *buf)
 {
@@ -715,9 +715,9 @@ static ssize_t stp_dst_offset_show(struct device *dev,
 	return sprintf(buf, "%i\n", (int)(s16) stp_info.dsto);
 }
 
-static DEVICE_ATTR(dst_offset, 0400, stp_dst_offset_show, NULL);
+static DEVICE_ATTR_RO(dst_offset);
 
-static ssize_t stp_leap_seconds_show(struct device *dev,
+static ssize_t leap_seconds_show(struct device *dev,
 					struct device_attribute *attr,
 					char *buf)
 {
@@ -726,9 +726,9 @@ static ssize_t stp_leap_seconds_show(struct device *dev,
 	return sprintf(buf, "%i\n", (int)(s16) stp_info.leaps);
 }
 
-static DEVICE_ATTR(leap_seconds, 0400, stp_leap_seconds_show, NULL);
+static DEVICE_ATTR_RO(leap_seconds);
 
-static ssize_t stp_stratum_show(struct device *dev,
+static ssize_t stratum_show(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
 {
@@ -737,9 +737,9 @@ static ssize_t stp_stratum_show(struct device *dev,
 	return sprintf(buf, "%i\n", (int)(s16) stp_info.stratum);
 }
 
-static DEVICE_ATTR(stratum, 0400, stp_stratum_show, NULL);
+static DEVICE_ATTR_RO(stratum);
 
-static ssize_t stp_time_offset_show(struct device *dev,
+static ssize_t time_offset_show(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
 {
@@ -748,9 +748,9 @@ static ssize_t stp_time_offset_show(struct device *dev,
 	return sprintf(buf, "%i\n", (int) stp_info.tto);
 }
 
-static DEVICE_ATTR(time_offset, 0400, stp_time_offset_show, NULL);
+static DEVICE_ATTR_RO(time_offset);
 
-static ssize_t stp_time_zone_offset_show(struct device *dev,
+static ssize_t time_zone_offset_show(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
 {
@@ -759,10 +759,9 @@ static ssize_t stp_time_zone_offset_show(struct device *dev,
 	return sprintf(buf, "%i\n", (int)(s16) stp_info.tzo);
 }
 
-static DEVICE_ATTR(time_zone_offset, 0400,
-			 stp_time_zone_offset_show, NULL);
+static DEVICE_ATTR_RO(time_zone_offset);
 
-static ssize_t stp_timing_mode_show(struct device *dev,
+static ssize_t timing_mode_show(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
 {
@@ -771,9 +770,9 @@ static ssize_t stp_timing_mode_show(struct device *dev,
 	return sprintf(buf, "%i\n", stp_info.tmd);
 }
 
-static DEVICE_ATTR(timing_mode, 0400, stp_timing_mode_show, NULL);
+static DEVICE_ATTR_RO(timing_mode);
 
-static ssize_t stp_timing_state_show(struct device *dev,
+static ssize_t timing_state_show(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
 {
@@ -782,16 +781,16 @@ static ssize_t stp_timing_state_show(struct device *dev,
 	return sprintf(buf, "%i\n", stp_info.tst);
 }
 
-static DEVICE_ATTR(timing_state, 0400, stp_timing_state_show, NULL);
+static DEVICE_ATTR_RO(timing_state);
 
-static ssize_t stp_online_show(struct device *dev,
+static ssize_t online_show(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
 {
 	return sprintf(buf, "%i\n", stp_online);
 }
 
-static ssize_t stp_online_store(struct device *dev,
+static ssize_t online_store(struct device *dev,
 				struct device_attribute *attr,
 				const char *buf, size_t count)
 {
@@ -817,18 +816,14 @@ static ssize_t stp_online_store(struct device *dev,
  * Can't use DEVICE_ATTR because the attribute should be named
  * stp/online but dev_attr_online already exists in this file ..
  */
-static struct device_attribute dev_attr_stp_online = {
-	.attr = { .name = "online", .mode = 0600 },
-	.show	= stp_online_show,
-	.store	= stp_online_store,
-};
+static DEVICE_ATTR_RW(online);
 
 static struct device_attribute *stp_attributes[] = {
 	&dev_attr_ctn_id,
 	&dev_attr_ctn_type,
 	&dev_attr_dst_offset,
 	&dev_attr_leap_seconds,
-	&dev_attr_stp_online,
+	&dev_attr_online,
 	&dev_attr_stratum,
 	&dev_attr_time_offset,
 	&dev_attr_time_zone_offset,

From f05f62d04271faa265c7a4f75638ebc380d182fa Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Thu, 25 Jun 2020 17:00:29 +0200
Subject: [PATCH 238/502] s390/vmem: get rid of memory segment list

I can't come up with a satisfying reason why we still need the memory
segment list. We used to represent in the list:
- boot memory
- standby memory added via add_memory()
- loaded dcss segments

When loading/unloading dcss segments, we already track them in a
separate list and check for overlaps
(arch/s390/mm/extmem.c:segment_overlaps_others()) when loading segments.

The overlap check was introduced for some segments in
commit b2300b9efe1b ("[S390] dcssblk: add >2G DCSSs support and stacked
contiguous DCSSs support.")
and was extended to cover all dcss segments in
commit ca57114609d1 ("s390/extmem: remove code for 31 bit addressing
mode").

Although I doubt that overlaps with boot memory and standby memory
are relevant, let's reshuffle the checks in load_segment() to request
the resource first. This will bail out in case we have overlaps with
other resources (esp. boot memory and standby memory). The order
is now different compared to segment_unload(), but
that should not matter.

This smells like a leftover from ancient times, let's get rid of it. We
can now convert vmem_remove_mapping() into a void function - everybody
ignored the return value already.

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200625150029.45019-1-david@redhat.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Tested-by: Gerald Schaefer <gerald.schaefer@de.ibm.com> [DCSS]
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/pgtable.h |   2 +-
 arch/s390/mm/extmem.c           |  25 +++----
 arch/s390/mm/vmem.c             | 115 ++------------------------------
 3 files changed, 21 insertions(+), 121 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 19d603bd1f36..7eb01a5459cd 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1669,7 +1669,7 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
 #define kern_addr_valid(addr)   (1)
 
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
-extern int vmem_remove_mapping(unsigned long start, unsigned long size);
+extern void vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
 extern int s390_enable_skey(void);
 extern void s390_reset_cmma(struct mm_struct *mm);
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index 9e0aa7aa03ba..105c09282f8c 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -313,15 +313,10 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
 		goto out_free;
 	}
 
-	rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
-
-	if (rc)
-		goto out_free;
-
 	seg->res = kzalloc(sizeof(struct resource), GFP_KERNEL);
 	if (seg->res == NULL) {
 		rc = -ENOMEM;
-		goto out_shared;
+		goto out_free;
 	}
 	seg->res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
 	seg->res->start = seg->start_addr;
@@ -335,12 +330,17 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
 	if (rc == SEG_TYPE_SC ||
 	    ((rc == SEG_TYPE_SR || rc == SEG_TYPE_ER) && !do_nonshared))
 		seg->res->flags |= IORESOURCE_READONLY;
+
+	/* Check for overlapping resources before adding the mapping. */
 	if (request_resource(&iomem_resource, seg->res)) {
 		rc = -EBUSY;
-		kfree(seg->res);
-		goto out_shared;
+		goto out_free_resource;
 	}
 
+	rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
+	if (rc)
+		goto out_resource;
+
 	if (do_nonshared)
 		diag_cc = dcss_diag(&loadnsr_scode, seg->dcss_name,
 				&start_addr, &end_addr);
@@ -351,14 +351,14 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
 		dcss_diag(&purgeseg_scode, seg->dcss_name,
 				&dummy, &dummy);
 		rc = diag_cc;
-		goto out_resource;
+		goto out_mapping;
 	}
 	if (diag_cc > 1) {
 		pr_warn("Loading DCSS %s failed with rc=%ld\n", name, end_addr);
 		rc = dcss_diag_translate_rc(end_addr);
 		dcss_diag(&purgeseg_scode, seg->dcss_name,
 				&dummy, &dummy);
-		goto out_resource;
+		goto out_mapping;
 	}
 	seg->start_addr = start_addr;
 	seg->end = end_addr;
@@ -377,11 +377,12 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
 			(void*) seg->end, segtype_string[seg->vm_segtype]);
 	}
 	goto out;
+ out_mapping:
+	vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
  out_resource:
 	release_resource(seg->res);
+ out_free_resource:
 	kfree(seg->res);
- out_shared:
-	vmem_remove_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
  out_free:
 	kfree(seg);
  out:
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 8b6282cf7d13..3b9e71654c37 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -20,14 +20,6 @@
 
 static DEFINE_MUTEX(vmem_mutex);
 
-struct memory_segment {
-	struct list_head list;
-	unsigned long start;
-	unsigned long size;
-};
-
-static LIST_HEAD(mem_segs);
-
 static void __ref *vmem_alloc_pages(unsigned int order)
 {
 	unsigned long size = PAGE_SIZE << order;
@@ -300,94 +292,25 @@ void vmemmap_free(unsigned long start, unsigned long end,
 {
 }
 
-/*
- * Add memory segment to the segment list if it doesn't overlap with
- * an already present segment.
- */
-static int insert_memory_segment(struct memory_segment *seg)
+void vmem_remove_mapping(unsigned long start, unsigned long size)
 {
-	struct memory_segment *tmp;
-
-	if (seg->start + seg->size > VMEM_MAX_PHYS ||
-	    seg->start + seg->size < seg->start)
-		return -ERANGE;
-
-	list_for_each_entry(tmp, &mem_segs, list) {
-		if (seg->start >= tmp->start + tmp->size)
-			continue;
-		if (seg->start + seg->size <= tmp->start)
-			continue;
-		return -ENOSPC;
-	}
-	list_add(&seg->list, &mem_segs);
-	return 0;
-}
-
-/*
- * Remove memory segment from the segment list.
- */
-static void remove_memory_segment(struct memory_segment *seg)
-{
-	list_del(&seg->list);
-}
-
-static void __remove_shared_memory(struct memory_segment *seg)
-{
-	remove_memory_segment(seg);
-	vmem_remove_range(seg->start, seg->size);
-}
-
-int vmem_remove_mapping(unsigned long start, unsigned long size)
-{
-	struct memory_segment *seg;
-	int ret;
-
 	mutex_lock(&vmem_mutex);
-
-	ret = -ENOENT;
-	list_for_each_entry(seg, &mem_segs, list) {
-		if (seg->start == start && seg->size == size)
-			break;
-	}
-
-	if (seg->start != start || seg->size != size)
-		goto out;
-
-	ret = 0;
-	__remove_shared_memory(seg);
-	kfree(seg);
-out:
+	vmem_remove_range(start, size);
 	mutex_unlock(&vmem_mutex);
-	return ret;
 }
 
 int vmem_add_mapping(unsigned long start, unsigned long size)
 {
-	struct memory_segment *seg;
 	int ret;
 
+	if (start + size > VMEM_MAX_PHYS ||
+	    start + size < start)
+		return -ERANGE;
+
 	mutex_lock(&vmem_mutex);
-	ret = -ENOMEM;
-	seg = kzalloc(sizeof(*seg), GFP_KERNEL);
-	if (!seg)
-		goto out;
-	seg->start = start;
-	seg->size = size;
-
-	ret = insert_memory_segment(seg);
-	if (ret)
-		goto out_free;
-
 	ret = vmem_add_mem(start, size);
 	if (ret)
-		goto out_remove;
-	goto out;
-
-out_remove:
-	__remove_shared_memory(seg);
-out_free:
-	kfree(seg);
-out:
+		vmem_remove_range(start, size);
 	mutex_unlock(&vmem_mutex);
 	return ret;
 }
@@ -421,27 +344,3 @@ void __init vmem_map_init(void)
 	pr_info("Write protected kernel read-only data: %luk\n",
 		(unsigned long)(__end_rodata - _stext) >> 10);
 }
-
-/*
- * Convert memblock.memory  to a memory segment list so there is a single
- * list that contains all memory segments.
- */
-static int __init vmem_convert_memory_chunk(void)
-{
-	struct memblock_region *reg;
-	struct memory_segment *seg;
-
-	mutex_lock(&vmem_mutex);
-	for_each_memblock(memory, reg) {
-		seg = kzalloc(sizeof(*seg), GFP_KERNEL);
-		if (!seg)
-			panic("Out of memory...\n");
-		seg->start = reg->base;
-		seg->size = reg->size;
-		insert_memory_segment(seg);
-	}
-	mutex_unlock(&vmem_mutex);
-	return 0;
-}
-
-core_initcall(vmem_convert_memory_chunk);

From 5cdfbdce5de6b5b56e104676409762fc1289a9c2 Mon Sep 17 00:00:00 2001
From: Oscar Carter <oscar.carter@gmx.com>
Date: Sat, 27 Jun 2020 14:54:17 +0200
Subject: [PATCH 239/502] s390/tty3270: remove function callback casts

In an effort to enable -Wcast-function-type in the top-level Makefile to
support Control Flow Integrity builds, remove all the function callback
casts.

To do this modify the function prototypes accordingly.

Signed-off-by: Oscar Carter <oscar.carter@gmx.com>
Message-Id: <20200627125417.18887-1-oscar.carter@gmx.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
[heiko.carstens@de.ibm.com: coding style changes]
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 drivers/s390/char/tty3270.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c
index 98d7fc152e32..aec996de44d9 100644
--- a/drivers/s390/char/tty3270.c
+++ b/drivers/s390/char/tty3270.c
@@ -556,8 +556,9 @@ tty3270_scroll_backward(struct kbd_data *kbd)
  * Pass input line to tty.
  */
 static void
-tty3270_read_tasklet(struct raw3270_request *rrq)
+tty3270_read_tasklet(unsigned long data)
 {
+	struct raw3270_request *rrq = (struct raw3270_request *)data;
 	static char kreset_data = TW_KR;
 	struct tty3270 *tp = container_of(rrq->view, struct tty3270, view);
 	char *input;
@@ -652,8 +653,9 @@ tty3270_issue_read(struct tty3270 *tp, int lock)
  * Hang up the tty
  */
 static void
-tty3270_hangup_tasklet(struct tty3270 *tp)
+tty3270_hangup_tasklet(unsigned long data)
 {
+	struct tty3270 *tp = (struct tty3270 *)data;
 	tty_port_tty_hangup(&tp->port, true);
 	raw3270_put_view(&tp->view);
 }
@@ -752,11 +754,9 @@ tty3270_alloc_view(void)
 
 	tty_port_init(&tp->port);
 	timer_setup(&tp->timer, tty3270_update, 0);
-	tasklet_init(&tp->readlet,
-		     (void (*)(unsigned long)) tty3270_read_tasklet,
+	tasklet_init(&tp->readlet, tty3270_read_tasklet,
 		     (unsigned long) tp->read);
-	tasklet_init(&tp->hanglet,
-		     (void (*)(unsigned long)) tty3270_hangup_tasklet,
+	tasklet_init(&tp->hanglet, tty3270_hangup_tasklet,
 		     (unsigned long) tp);
 	INIT_WORK(&tp->resize_work, tty3270_resize_work);
 

From d4e0340919fb9190a57e879fb3125c4acce0d9b2 Mon Sep 17 00:00:00 2001
From: Saravana Kannan <saravanak@google.com>
Date: Mon, 22 Jun 2020 18:18:02 -0700
Subject: [PATCH 240/502] arm64/module: Optimize module load time by optimizing
 PLT counting

When loading a module, module_frob_arch_sections() tries to figure out
the number of PLTs that'll be needed to handle all the RELAs. While
doing this, it tries to dedupe PLT allocations for multiple
R_AARCH64_CALL26 relocations to the same symbol. It does the same for
R_AARCH64_JUMP26 relocations.

To make checks for duplicates easier/faster, it sorts the relocation
list by type, symbol and addend. That way, to check for a duplicate
relocation, it just needs to compare with the previous entry.

However, sorting the entire relocation array is unnecessary and
expensive (O(n log n)) because there are a lot of other relocation types
that don't need deduping or can't be deduped.

So this commit partitions the array into entries that need deduping and
those that don't. And then sorts just the part that needs deduping. And
when CONFIG_RANDOMIZE_BASE is disabled, the sorting is skipped entirely
because PLTs are not allocated for R_AARCH64_CALL26 and R_AARCH64_JUMP26
if it's disabled.

This gives significant reduction in module load time for modules with
large number of relocations with no measurable impact on modules with a
small number of relocations. In my test setup with CONFIG_RANDOMIZE_BASE
enabled, these were the results for a few downstream modules:

Module		Size (MB)
wlan		14
video codec	3.8
drm		1.8
IPA		2.5
audio		1.2
gpu		1.8

Without this patch:
Module		Number of entries sorted	Module load time (ms)
wlan		243739				283
video codec	74029				138
drm		53837				67
IPA		42800				90
audio		21326				27
gpu		20967				32

Total time to load all these modules: 637 ms

With this patch:
Module		Number of entries sorted	Module load time (ms)
wlan		22454				61
video codec	10150				47
drm		13014				40
IPA		8097				63
audio		4606				16
gpu		6527				20

Total time to load all these modules: 247 ms

Time saved during boot for just these 6 modules: 390 ms

Signed-off-by: Saravana Kannan <saravanak@google.com>
Acked-by: Will Deacon <will@kernel.org>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Link: https://lore.kernel.org/r/20200623011803.91232-1-saravanak@google.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/module-plts.c | 46 ++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
index 65b08a74aec6..0ce3a28e3347 100644
--- a/arch/arm64/kernel/module-plts.c
+++ b/arch/arm64/kernel/module-plts.c
@@ -253,6 +253,40 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela *rela, int num,
 	return ret;
 }
 
+static bool branch_rela_needs_plt(Elf64_Sym *syms, Elf64_Rela *rela,
+				  Elf64_Word dstidx)
+{
+
+	Elf64_Sym *s = syms + ELF64_R_SYM(rela->r_info);
+
+	if (s->st_shndx == dstidx)
+		return false;
+
+	return ELF64_R_TYPE(rela->r_info) == R_AARCH64_JUMP26 ||
+	       ELF64_R_TYPE(rela->r_info) == R_AARCH64_CALL26;
+}
+
+/* Group branch PLT relas at the front end of the array. */
+static int partition_branch_plt_relas(Elf64_Sym *syms, Elf64_Rela *rela,
+				      int numrels, Elf64_Word dstidx)
+{
+	int i = 0, j = numrels - 1;
+
+	if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+		return 0;
+
+	while (i < j) {
+		if (branch_rela_needs_plt(syms, &rela[i], dstidx))
+			i++;
+		else if (branch_rela_needs_plt(syms, &rela[j], dstidx))
+			swap(rela[i], rela[j]);
+		else
+			j--;
+	}
+
+	return i;
+}
+
 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			      char *secstrings, struct module *mod)
 {
@@ -290,7 +324,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 
 	for (i = 0; i < ehdr->e_shnum; i++) {
 		Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset;
-		int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela);
+		int nents, numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela);
 		Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info;
 
 		if (sechdrs[i].sh_type != SHT_RELA)
@@ -300,8 +334,14 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 		if (!(dstsec->sh_flags & SHF_EXECINSTR))
 			continue;
 
-		/* sort by type, symbol index and addend */
-		sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL);
+		/*
+		 * sort branch relocations requiring a PLT by type, symbol index
+		 * and addend
+		 */
+		nents = partition_branch_plt_relas(syms, rels, numrels,
+						   sechdrs[i].sh_info);
+		if (nents)
+			sort(rels, nents, sizeof(Elf64_Rela), cmp_rela, NULL);
 
 		if (!str_has_prefix(secstrings + dstsec->sh_name, ".init"))
 			core_plts += count_plts(syms, rels, numrels,

From 3cb9d5464c1ceea86f6225089b2f7965989cf316 Mon Sep 17 00:00:00 2001
From: Wei Wang <wei.w.wang@intel.com>
Date: Sat, 13 Jun 2020 16:09:46 +0800
Subject: [PATCH 241/502] perf/x86: Fix variable types for LBR registers

The MSR variable type can be 'unsigned int', which uses less memory than
the longer 'unsigned long'. Fix 'struct x86_pmu' for that. The lbr_nr won't
be a negative number, so make it 'unsigned int' as well.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200613080958.132489-2-like.xu@linux.intel.com
---
 arch/x86/events/perf_event.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e17a3d8a47ed..eb37f6c43c96 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -673,8 +673,8 @@ struct x86_pmu {
 	/*
 	 * Intel LBR
 	 */
-	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
-	int		lbr_nr;			   /* hardware stack size */
+	unsigned int	lbr_tos, lbr_from, lbr_to,
+			lbr_nr;			   /* LBR base regs and size */
 	u64		lbr_sel_mask;		   /* LBR_SELECT valid bits */
 	const int	*lbr_sel_map;		   /* lbr_select mappings */
 	bool		lbr_double_abort;	   /* duplicated lbr aborts */

From 027440b5d426a51f33b515bbd236cc479d1e051f Mon Sep 17 00:00:00 2001
From: Like Xu <like.xu@linux.intel.com>
Date: Sat, 13 Jun 2020 16:09:47 +0800
Subject: [PATCH 242/502] perf/x86/core: Refactor hw->idx checks and cleanup

For intel_pmu_en/disable_event(), reorder the branches checks for hw->idx
and make them sorted by probability: gp,fixed,bts,others.

Clean up the x86_assign_hw_event() by converting multiple if-else
statements to a switch statement.

To skip x86_perf_event_update() and x86_perf_event_set_period(),
it's generic to replace "idx == INTEL_PMC_IDX_FIXED_BTS" check with
'!hwc->event_base' because that should be 0 for all non-gp/fixed cases.

Wrap related bit operations into intel_set/clear_masks() and make the main
path cleaner and more readable.

No functional changes.

Signed-off-by: Like Xu <like.xu@linux.intel.com>
Original-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200613080958.132489-3-like.xu@linux.intel.com
---
 arch/x86/events/core.c       | 25 +++++++----
 arch/x86/events/intel/core.c | 85 +++++++++++++++++++-----------------
 2 files changed, 62 insertions(+), 48 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 4103665c6e03..15cb7af7db18 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -71,10 +71,9 @@ u64 x86_perf_event_update(struct perf_event *event)
 	struct hw_perf_event *hwc = &event->hw;
 	int shift = 64 - x86_pmu.cntval_bits;
 	u64 prev_raw_count, new_raw_count;
-	int idx = hwc->idx;
 	u64 delta;
 
-	if (idx == INTEL_PMC_IDX_FIXED_BTS)
+	if (unlikely(!hwc->event_base))
 		return 0;
 
 	/*
@@ -1097,22 +1096,30 @@ static inline void x86_assign_hw_event(struct perf_event *event,
 				struct cpu_hw_events *cpuc, int i)
 {
 	struct hw_perf_event *hwc = &event->hw;
+	int idx;
 
-	hwc->idx = cpuc->assign[i];
+	idx = hwc->idx = cpuc->assign[i];
 	hwc->last_cpu = smp_processor_id();
 	hwc->last_tag = ++cpuc->tags[i];
 
-	if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
+	switch (hwc->idx) {
+	case INTEL_PMC_IDX_FIXED_BTS:
 		hwc->config_base = 0;
 		hwc->event_base	= 0;
-	} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {
+		break;
+
+	case INTEL_PMC_IDX_FIXED ... INTEL_PMC_IDX_FIXED_BTS-1:
 		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
-		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED);
-		hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30;
-	} else {
+		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 +
+				(idx - INTEL_PMC_IDX_FIXED);
+		hwc->event_base_rdpmc = (idx - INTEL_PMC_IDX_FIXED) | 1<<30;
+		break;
+
+	default:
 		hwc->config_base = x86_pmu_config_addr(hwc->idx);
 		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
 		hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx);
+		break;
 	}
 }
 
@@ -1233,7 +1240,7 @@ int x86_perf_event_set_period(struct perf_event *event)
 	s64 period = hwc->sample_period;
 	int ret = 0, idx = hwc->idx;
 
-	if (idx == INTEL_PMC_IDX_FIXED_BTS)
+	if (unlikely(!hwc->event_base))
 		return 0;
 
 	/*
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index ca35c8b5ee10..8dac4c61bf76 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2136,8 +2136,35 @@ static inline void intel_pmu_ack_status(u64 ack)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+static inline bool event_is_checkpointed(struct perf_event *event)
 {
+	return unlikely(event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
+}
+
+static inline void intel_set_masks(struct perf_event *event, int idx)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	if (event->attr.exclude_host)
+		__set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
+	if (event->attr.exclude_guest)
+		__set_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
+	if (event_is_checkpointed(event))
+		__set_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
+}
+
+static inline void intel_clear_masks(struct perf_event *event, int idx)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	__clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_guest_mask);
+	__clear_bit(idx, (unsigned long *)&cpuc->intel_ctrl_host_mask);
+	__clear_bit(idx, (unsigned long *)&cpuc->intel_cp_status);
+}
+
+static void intel_pmu_disable_fixed(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx - INTEL_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
 
@@ -2148,31 +2175,22 @@ static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
 	wrmsrl(hwc->config_base, ctrl_val);
 }
 
-static inline bool event_is_checkpointed(struct perf_event *event)
-{
-	return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0;
-}
-
 static void intel_pmu_disable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	int idx = hwc->idx;
 
-	if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
+	if (idx < INTEL_PMC_IDX_FIXED) {
+		intel_clear_masks(event, idx);
+		x86_pmu_disable_event(event);
+	} else if (idx < INTEL_PMC_IDX_FIXED_BTS) {
+		intel_clear_masks(event, idx);
+		intel_pmu_disable_fixed(event);
+	} else if (idx == INTEL_PMC_IDX_FIXED_BTS) {
 		intel_pmu_disable_bts();
 		intel_pmu_drain_bts_buffer();
-		return;
 	}
 
-	cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx);
-	cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx);
-	cpuc->intel_cp_status &= ~(1ull << hwc->idx);
-
-	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		intel_pmu_disable_fixed(hwc);
-	else
-		x86_pmu_disable_event(event);
-
 	/*
 	 * Needs to be called after x86_pmu_disable_event,
 	 * so we don't trigger the event without PEBS bit set.
@@ -2238,33 +2256,22 @@ static void intel_pmu_enable_fixed(struct perf_event *event)
 static void intel_pmu_enable_event(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
-	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-
-	if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) {
-		if (!__this_cpu_read(cpu_hw_events.enabled))
-			return;
-
-		intel_pmu_enable_bts(hwc->config);
-		return;
-	}
-
-	if (event->attr.exclude_host)
-		cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx);
-	if (event->attr.exclude_guest)
-		cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx);
-
-	if (unlikely(event_is_checkpointed(event)))
-		cpuc->intel_cp_status |= (1ull << hwc->idx);
+	int idx = hwc->idx;
 
 	if (unlikely(event->attr.precise_ip))
 		intel_pmu_pebs_enable(event);
 
-	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+	if (idx < INTEL_PMC_IDX_FIXED) {
+		intel_set_masks(event, idx);
+		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
+	} else if (idx < INTEL_PMC_IDX_FIXED_BTS) {
+		intel_set_masks(event, idx);
 		intel_pmu_enable_fixed(event);
-		return;
+	} else if (idx == INTEL_PMC_IDX_FIXED_BTS) {
+		if (!__this_cpu_read(cpu_hw_events.enabled))
+			return;
+		intel_pmu_enable_bts(hwc->config);
 	}
-
-	__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
 }
 
 static void intel_pmu_add_event(struct perf_event *event)

From b2d6504761a50b9493eb4b20f6e188b673f20c32 Mon Sep 17 00:00:00 2001
From: Like Xu <like.xu@linux.intel.com>
Date: Sat, 13 Jun 2020 16:09:48 +0800
Subject: [PATCH 243/502] perf/x86/lbr: Add interface to get LBR information

The LBR records msrs are model specific. The perf subsystem has already
obtained the base addresses of LBR records based on the cpu model.

Therefore, an interface is added to allow callers outside the perf
subsystem to obtain this LBR information. It's useful for hypervisors
to emulate the LBR feature for guests with less code.

Signed-off-by: Like Xu <like.xu@linux.intel.com>
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200613080958.132489-4-like.xu@linux.intel.com
---
 arch/x86/events/intel/lbr.c       | 20 ++++++++++++++++++++
 arch/x86/include/asm/perf_event.h | 12 ++++++++++++
 2 files changed, 32 insertions(+)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 65113b16804a..2ed3f2a51bdf 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1343,3 +1343,23 @@ void intel_pmu_lbr_init_knl(void)
 	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_LIP)
 		x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
 }
+
+/**
+ * x86_perf_get_lbr - get the LBR records information
+ *
+ * @lbr: the caller's memory to store the LBR records information
+ *
+ * Returns: 0 indicates the LBR info has been successfully obtained
+ */
+int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
+{
+	int lbr_fmt = x86_pmu.intel_cap.lbr_format;
+
+	lbr->nr = x86_pmu.lbr_nr;
+	lbr->from = x86_pmu.lbr_from;
+	lbr->to = x86_pmu.lbr_to;
+	lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? MSR_LBR_INFO_0 : 0;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index e855e9cf2c37..5d2c30f0df02 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -333,6 +333,13 @@ struct perf_guest_switch_msr {
 	u64 host, guest;
 };
 
+struct x86_pmu_lbr {
+	unsigned int	nr;
+	unsigned int	from;
+	unsigned int	to;
+	unsigned int	info;
+};
+
 extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap);
 extern void perf_check_microcode(void);
 extern int x86_perf_rdpmc_index(struct perf_event *event);
@@ -348,12 +355,17 @@ static inline void perf_check_microcode(void) { }
 
 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
 extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr);
+extern int x86_perf_get_lbr(struct x86_pmu_lbr *lbr);
 #else
 static inline struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr)
 {
 	*nr = 0;
 	return NULL;
 }
+static inline int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
+{
+	return -1;
+}
 #endif
 
 #ifdef CONFIG_CPU_SUP_INTEL

From 097e4311cda952dfb047f2a49d35aa5de500d474 Mon Sep 17 00:00:00 2001
From: Like Xu <like.xu@linux.intel.com>
Date: Sat, 13 Jun 2020 16:09:49 +0800
Subject: [PATCH 244/502] perf/x86: Add constraint to create guest LBR event
 without hw counter

The hypervisor may request the perf subsystem to schedule a time window
to directly access the LBR records msrs for its own use. Normally, it would
create a guest LBR event with callstack mode enabled, which is scheduled
along with other ordinary LBR events on the host but in an exclusive way.

To avoid wasting a counter for the guest LBR event, the perf tracks its
hw->idx via INTEL_PMC_IDX_FIXED_VLBR and assigns it with a fake VLBR
counter with the help of new vlbr_constraint. As with the BTS event,
there is actually no hardware counter assigned for the guest LBR event.

Signed-off-by: Like Xu <like.xu@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200514083054.62538-5-like.xu@linux.intel.com
---
 arch/x86/events/core.c            |  1 +
 arch/x86/events/intel/core.c      | 18 ++++++++++++++++++
 arch/x86/events/intel/lbr.c       |  4 ++++
 arch/x86/events/perf_event.h      |  1 +
 arch/x86/include/asm/perf_event.h | 22 +++++++++++++++++++++-
 5 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 15cb7af7db18..d740c861724c 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1104,6 +1104,7 @@ static inline void x86_assign_hw_event(struct perf_event *event,
 
 	switch (hwc->idx) {
 	case INTEL_PMC_IDX_FIXED_BTS:
+	case INTEL_PMC_IDX_FIXED_VLBR:
 		hwc->config_base = 0;
 		hwc->event_base	= 0;
 		break;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 8dac4c61bf76..51e1fba7b1d1 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2621,6 +2621,20 @@ intel_bts_constraints(struct perf_event *event)
 	return NULL;
 }
 
+/*
+ * Note: matches a fake event, like Fixed2.
+ */
+static struct event_constraint *
+intel_vlbr_constraints(struct perf_event *event)
+{
+	struct event_constraint *c = &vlbr_constraint;
+
+	if (unlikely(constraint_match(c, event->hw.config)))
+		return c;
+
+	return NULL;
+}
+
 static int intel_alt_er(int idx, u64 config)
 {
 	int alt_idx = idx;
@@ -2811,6 +2825,10 @@ __intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 {
 	struct event_constraint *c;
 
+	c = intel_vlbr_constraints(event);
+	if (c)
+		return c;
+
 	c = intel_bts_constraints(event);
 	if (c)
 		return c;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 2ed3f2a51bdf..d285d26c1578 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1363,3 +1363,7 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
+
+struct event_constraint vlbr_constraint =
+	FIXED_EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT,
+			       (INTEL_PMC_IDX_FIXED_VLBR - INTEL_PMC_IDX_FIXED));
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index eb37f6c43c96..77a6dd66bd9a 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -990,6 +990,7 @@ void release_ds_buffers(void);
 void reserve_ds_buffers(void);
 
 extern struct event_constraint bts_constraint;
+extern struct event_constraint vlbr_constraint;
 
 void intel_pmu_enable_bts(u64 config);
 
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 5d2c30f0df02..2df707311d17 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -192,9 +192,29 @@ struct x86_pmu_capability {
 #define GLOBAL_STATUS_UNC_OVF				BIT_ULL(61)
 #define GLOBAL_STATUS_ASIF				BIT_ULL(60)
 #define GLOBAL_STATUS_COUNTERS_FROZEN			BIT_ULL(59)
-#define GLOBAL_STATUS_LBRS_FROZEN			BIT_ULL(58)
+#define GLOBAL_STATUS_LBRS_FROZEN_BIT			58
+#define GLOBAL_STATUS_LBRS_FROZEN			BIT_ULL(GLOBAL_STATUS_LBRS_FROZEN_BIT)
 #define GLOBAL_STATUS_TRACE_TOPAPMI			BIT_ULL(55)
 
+/*
+ * We model guest LBR event tracing as another fixed-mode PMC like BTS.
+ *
+ * We choose bit 58 because it's used to indicate LBR stack frozen state
+ * for architectural perfmon v4, also we unconditionally mask that bit in
+ * the handle_pmi_common(), so it'll never be set in the overflow handling.
+ *
+ * With this fake counter assigned, the guest LBR event user (such as KVM),
+ * can program the LBR registers on its own, and we don't actually do anything
+ * with then in the host context.
+ */
+#define INTEL_PMC_IDX_FIXED_VLBR	(GLOBAL_STATUS_LBRS_FROZEN_BIT)
+
+/*
+ * Pseudo-encoding the guest LBR event as event=0x00,umask=0x1b,
+ * since it would claim bit 58 which is effectively Fixed26.
+ */
+#define INTEL_FIXED_VLBR_EVENT	0x1b00
+
 /*
  * Adaptive PEBS v4
  */

From e1ad1ac2deb8f90af9f12ff316989dd5675dec11 Mon Sep 17 00:00:00 2001
From: Like Xu <like.xu@linux.intel.com>
Date: Sat, 13 Jun 2020 16:09:50 +0800
Subject: [PATCH 245/502] perf/x86: Keep LBR records unchanged in host context
 for guest usage

When a guest wants to use the LBR registers, its hypervisor creates a guest
LBR event and lets host perf schedule it. The LBR records msrs are
accessible to the guest when its guest LBR event is scheduled on
by the perf subsystem.

Before scheduling this event out, we should avoid host changes on
IA32_DEBUGCTLMSR or LBR_SELECT. Otherwise, some unexpected branch
operations may interfere with guest behavior, pollute LBR records, and even
cause host branches leakage. In addition, the read operation
on host is also avoidable.

To ensure that guest LBR records are not lost during the context switch,
the guest LBR event would enable the callstack mode which could
save/restore guest unread LBR records with the help of
intel_pmu_lbr_sched_task() naturally.

However, the guest LBR_SELECT may change for its own use and the host
LBR event doesn't save/restore it. To ensure that we don't lose the guest
LBR_SELECT value when the guest LBR event is running, the vlbr_constraint
is bound up with a new constraint flag PERF_X86_EVENT_LBR_SELECT.

Signed-off-by: Like Xu <like.xu@linux.intel.com>
Signed-off-by: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200514083054.62538-6-like.xu@linux.intel.com
---
 arch/x86/events/intel/core.c |  6 ++++--
 arch/x86/events/intel/lbr.c  | 31 ++++++++++++++++++++++++++-----
 arch/x86/events/perf_event.h |  3 +++
 3 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 51e1fba7b1d1..582ddff9a359 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2189,7 +2189,8 @@ static void intel_pmu_disable_event(struct perf_event *event)
 	} else if (idx == INTEL_PMC_IDX_FIXED_BTS) {
 		intel_pmu_disable_bts();
 		intel_pmu_drain_bts_buffer();
-	}
+	} else if (idx == INTEL_PMC_IDX_FIXED_VLBR)
+		intel_clear_masks(event, idx);
 
 	/*
 	 * Needs to be called after x86_pmu_disable_event,
@@ -2271,7 +2272,8 @@ static void intel_pmu_enable_event(struct perf_event *event)
 		if (!__this_cpu_read(cpu_hw_events.enabled))
 			return;
 		intel_pmu_enable_bts(hwc->config);
-	}
+	} else if (idx == INTEL_PMC_IDX_FIXED_VLBR)
+		intel_set_masks(event, idx);
 }
 
 static void intel_pmu_add_event(struct perf_event *event)
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index d285d26c1578..d03de7539957 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -383,6 +383,9 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 
 	wrmsrl(x86_pmu.lbr_tos, tos);
 	task_ctx->lbr_stack_state = LBR_NONE;
+
+	if (cpuc->lbr_select)
+		wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
 static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
@@ -415,6 +418,9 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
 
 	cpuc->last_task_ctx = task_ctx;
 	cpuc->last_log_id = ++task_ctx->log_id;
+
+	if (cpuc->lbr_select)
+		rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
 void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
@@ -485,6 +491,9 @@ void intel_pmu_lbr_add(struct perf_event *event)
 	if (!x86_pmu.lbr_nr)
 		return;
 
+	if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
+		cpuc->lbr_select = 1;
+
 	cpuc->br_sel = event->hw.branch_reg.reg;
 
 	if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
@@ -532,6 +541,9 @@ void intel_pmu_lbr_del(struct perf_event *event)
 		task_ctx->lbr_callstack_users--;
 	}
 
+	if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
+		cpuc->lbr_select = 0;
+
 	if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0)
 		cpuc->lbr_pebs_users--;
 	cpuc->lbr_users--;
@@ -540,11 +552,19 @@ void intel_pmu_lbr_del(struct perf_event *event)
 	perf_sched_cb_dec(event->ctx->pmu);
 }
 
+static inline bool vlbr_exclude_host(void)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	return test_bit(INTEL_PMC_IDX_FIXED_VLBR,
+		(unsigned long *)&cpuc->intel_ctrl_guest_mask);
+}
+
 void intel_pmu_lbr_enable_all(bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
-	if (cpuc->lbr_users)
+	if (cpuc->lbr_users && !vlbr_exclude_host())
 		__intel_pmu_lbr_enable(pmi);
 }
 
@@ -552,7 +572,7 @@ void intel_pmu_lbr_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
-	if (cpuc->lbr_users)
+	if (cpuc->lbr_users && !vlbr_exclude_host())
 		__intel_pmu_lbr_disable();
 }
 
@@ -694,7 +714,8 @@ void intel_pmu_lbr_read(void)
 	 * This could be smarter and actually check the event,
 	 * but this simple approach seems to work for now.
 	 */
-	if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users)
+	if (!cpuc->lbr_users || vlbr_exclude_host() ||
+	    cpuc->lbr_users == cpuc->lbr_pebs_users)
 		return;
 
 	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
@@ -1365,5 +1386,5 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
 EXPORT_SYMBOL_GPL(x86_perf_get_lbr);
 
 struct event_constraint vlbr_constraint =
-	FIXED_EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT,
-			       (INTEL_PMC_IDX_FIXED_VLBR - INTEL_PMC_IDX_FIXED));
+	__EVENT_CONSTRAINT(INTEL_FIXED_VLBR_EVENT, (1ULL << INTEL_PMC_IDX_FIXED_VLBR),
+			  FIXED_EVENT_FLAGS, 1, 0, PERF_X86_EVENT_LBR_SELECT);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 77a6dd66bd9a..81475963df99 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -78,6 +78,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
 #define PERF_X86_EVENT_LARGE_PEBS	0x0400 /* use large PEBS */
 #define PERF_X86_EVENT_PEBS_VIA_PT	0x0800 /* use PT buffer for PEBS */
 #define PERF_X86_EVENT_PAIR		0x1000 /* Large Increment per Cycle */
+#define PERF_X86_EVENT_LBR_SELECT	0x2000 /* Save/Restore MSR_LBR_SELECT */
 
 struct amd_nb {
 	int nb_id;  /* NorthBridge id */
@@ -237,6 +238,7 @@ struct cpu_hw_events {
 	u64				br_sel;
 	struct x86_perf_task_context	*last_task_ctx;
 	int				last_log_id;
+	int				lbr_select;
 
 	/*
 	 * Intel host/guest exclude bits
@@ -722,6 +724,7 @@ struct x86_perf_task_context {
 	u64 lbr_from[MAX_LBR_ENTRIES];
 	u64 lbr_to[MAX_LBR_ENTRIES];
 	u64 lbr_info[MAX_LBR_ENTRIES];
+	u64 lbr_sel;
 	int tos;
 	int valid_lbrs;
 	int lbr_callstack_users;

From 638d503130098e234b002942b33a4d886ef6f270 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Mon, 29 Jun 2020 10:08:31 +0530
Subject: [PATCH 246/502] arm64/panic: Unify all three existing notifier blocks

Currently there are three different registered panic notifier blocks. This
unifies all of them into a single one i.e arm64_panic_block, hence reducing
code duplication and required calling sequence during panic. This preserves
the existing dump sequence. While here, just use device_initcall() directly
instead of __initcall(), which is a legacy alias for the former. This
replacement is a pure cleanup with no functional implications.

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Link: https://lore.kernel.org/r/1593405511-7625-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h |  1 +
 arch/arm64/include/asm/memory.h     |  1 +
 arch/arm64/kernel/cpufeature.c      | 15 +--------------
 arch/arm64/kernel/setup.c           | 24 ++++++++++++++----------
 arch/arm64/mm/init.c                | 18 +-----------------
 5 files changed, 18 insertions(+), 41 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 5d1f4ae42799..e375529ca9fc 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -774,6 +774,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1)
 }
 
 u32 get_kvm_ipa_limit(void);
+void dump_cpu_features(void);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index a1871bb32bb1..2a88cb734d06 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -322,6 +322,7 @@ static inline void *phys_to_virt(phys_addr_t x)
 	__is_lm_address(__addr) && pfn_valid(virt_to_pfn(__addr));	\
 })
 
+void dump_mem_limit(void);
 #endif /* !ASSEMBLY */
 
 /*
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9f63053a63a9..9b79df930396 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -119,25 +119,12 @@ static inline void finalize_system_capabilities(void)
 	static_branch_enable(&arm64_const_caps_ready);
 }
 
-static int dump_cpu_hwcaps(struct notifier_block *self, unsigned long v, void *p)
+void dump_cpu_features(void)
 {
 	/* file-wide pr_fmt adds "CPU features: " prefix */
 	pr_emerg("0x%*pb\n", ARM64_NCAPS, &cpu_hwcaps);
-	return 0;
 }
 
-static struct notifier_block cpu_hwcaps_notifier = {
-	.notifier_call = dump_cpu_hwcaps
-};
-
-static int __init register_cpu_hwcaps_dumper(void)
-{
-	atomic_notifier_chain_register(&panic_notifier_list,
-				       &cpu_hwcaps_notifier);
-	return 0;
-}
-__initcall(register_cpu_hwcaps_dumper);
-
 DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS);
 EXPORT_SYMBOL(cpu_hwcap_keys);
 
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index 93b3844cf442..c793276ec7ad 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -400,11 +400,7 @@ static int __init topology_init(void)
 }
 subsys_initcall(topology_init);
 
-/*
- * Dump out kernel offset information on panic.
- */
-static int dump_kernel_offset(struct notifier_block *self, unsigned long v,
-			      void *p)
+static void dump_kernel_offset(void)
 {
 	const unsigned long offset = kaslr_offset();
 
@@ -415,17 +411,25 @@ static int dump_kernel_offset(struct notifier_block *self, unsigned long v,
 	} else {
 		pr_emerg("Kernel Offset: disabled\n");
 	}
+}
+
+static int arm64_panic_block_dump(struct notifier_block *self,
+				  unsigned long v, void *p)
+{
+	dump_kernel_offset();
+	dump_cpu_features();
+	dump_mem_limit();
 	return 0;
 }
 
-static struct notifier_block kernel_offset_notifier = {
-	.notifier_call = dump_kernel_offset
+static struct notifier_block arm64_panic_block = {
+	.notifier_call = arm64_panic_block_dump
 };
 
-static int __init register_kernel_offset_dumper(void)
+static int __init register_arm64_panic_block(void)
 {
 	atomic_notifier_chain_register(&panic_notifier_list,
-				       &kernel_offset_notifier);
+				       &arm64_panic_block);
 	return 0;
 }
-__initcall(register_kernel_offset_dumper);
+device_initcall(register_arm64_panic_block);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 1e93cfc7c47a..6c3eb424c613 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -563,27 +563,11 @@ void free_initmem(void)
 	unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin));
 }
 
-/*
- * Dump out memory limit information on panic.
- */
-static int dump_mem_limit(struct notifier_block *self, unsigned long v, void *p)
+void dump_mem_limit(void)
 {
 	if (memory_limit != PHYS_ADDR_MAX) {
 		pr_emerg("Memory Limit: %llu MB\n", memory_limit >> 20);
 	} else {
 		pr_emerg("Memory Limit: none\n");
 	}
-	return 0;
 }
-
-static struct notifier_block mem_limit_notifier = {
-	.notifier_call = dump_mem_limit,
-};
-
-static int __init register_mem_limit_dumper(void)
-{
-	atomic_notifier_chain_register(&panic_notifier_list,
-				       &mem_limit_notifier);
-	return 0;
-}
-__initcall(register_mem_limit_dumper);

From 1d50e5d0c5052446cb85a3bf11fe8ba4e8d770ca Mon Sep 17 00:00:00 2001
From: Bhupesh Sharma <bhsharma@redhat.com>
Date: Thu, 14 May 2020 00:22:36 +0530
Subject: [PATCH 247/502] crash_core, vmcoreinfo: Append 'MAX_PHYSMEM_BITS' to
 vmcoreinfo

Right now user-space tools like 'makedumpfile' and 'crash' need to rely
on a best-guess method of determining value of 'MAX_PHYSMEM_BITS'
supported by underlying kernel.

This value is used in user-space code to calculate the bit-space
required to store a section for SPARSEMEM (similar to the existing
calculation method used in the kernel implementation):

  #define SECTIONS_SHIFT    (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)

Now, regressions have been reported in user-space utilities
like 'makedumpfile' and 'crash' on arm64, with the recently added
kernel support for 52-bit physical address space, as there is
no clear method of determining this value in user-space
(other than reading kernel CONFIG flags).

As per suggestion from makedumpfile maintainer (Kazu), it makes more
sense to append 'MAX_PHYSMEM_BITS' to vmcoreinfo in the core code itself
rather than in arch-specific code, so that the user-space code for other
archs can also benefit from this addition to the vmcoreinfo and use it
as a standard way of determining 'SECTIONS_SHIFT' value in user-land.

A reference 'makedumpfile' implementation which reads the
'MAX_PHYSMEM_BITS' value from vmcoreinfo in an arch-independent fashion
is available here:

While at it also update vmcoreinfo documentation for 'MAX_PHYSMEM_BITS'
variable being added to vmcoreinfo.

'MAX_PHYSMEM_BITS' defines the maximum supported physical address
space memory.

Signed-off-by: Bhupesh Sharma <bhsharma@redhat.com>
Tested-by: John Donnelly <john.p.donnelly@oracle.com>
Acked-by: Dave Young <dyoung@redhat.com>
Cc: Boris Petkov <bp@alien8.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: James Morse <james.morse@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Dave Anderson <anderson@redhat.com>
Cc: Kazuhito Hagio <k-hagio@ab.jp.nec.com>
Cc: x86@kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Cc: kexec@lists.infradead.org
Link: https://lore.kernel.org/r/1589395957-24628-2-git-send-email-bhsharma@redhat.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst | 5 +++++
 kernel/crash_core.c                            | 1 +
 2 files changed, 6 insertions(+)

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index e4ee8b2db604..2a632020f809 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -93,6 +93,11 @@ It exists in the sparse memory mapping model, and it is also somewhat
 similar to the mem_map variable, both of them are used to translate an
 address.
 
+MAX_PHYSMEM_BITS
+----------------
+
+Defines the maximum supported physical address space memory.
+
 page
 ----
 
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 9f1557b98468..18175687133a 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -413,6 +413,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
 	VMCOREINFO_STRUCT_SIZE(mem_section);
 	VMCOREINFO_OFFSET(mem_section, section_mem_map);
+	VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS);
 #endif
 	VMCOREINFO_STRUCT_SIZE(page);
 	VMCOREINFO_STRUCT_SIZE(pglist_data);

From bbdbc11804ff0b4130e7550113b452e96a74d16e Mon Sep 17 00:00:00 2001
From: Bhupesh Sharma <bhsharma@redhat.com>
Date: Thu, 14 May 2020 00:22:37 +0530
Subject: [PATCH 248/502] arm64/crash_core: Export TCR_EL1.T1SZ in vmcoreinfo

TCR_EL1.TxSZ, which controls the VA space size, is configured by a
single kernel image to support either 48-bit or 52-bit VA space.

If the ARMv8.2-LVA optional feature is present and we are running
with a 64KB page size, then it is possible to use 52-bits of address
space for both userspace and kernel addresses. However, any kernel
binary that supports 52-bit must also be able to fall back to 48-bit
at early boot time if the hardware feature is not present.

Since TCR_EL1.T1SZ indicates the size of the memory region addressed by
TTBR1_EL1, export the same in vmcoreinfo. User-space utilities like
makedumpfile and crash-utility need to read this value from vmcoreinfo
for determining if a virtual address lies in the linear map range.

While at it also add documentation for TCR_EL1.T1SZ variable being
added to vmcoreinfo.

It indicates the size offset of the memory region addressed by
TTBR1_EL1.

Signed-off-by: Bhupesh Sharma <bhsharma@redhat.com>
Tested-by: John Donnelly <john.p.donnelly@oracle.com>
Tested-by: Kamlakant Patel <kamlakantp@marvell.com>
Tested-by: Amit Daniel Kachhap <amit.kachhap@arm.com>
Reviewed-by: James Morse <james.morse@arm.com>
Reviewed-by: Amit Daniel Kachhap <amit.kachhap@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Dave Anderson <anderson@redhat.com>
Cc: Kazuhito Hagio <k-hagio@ab.jp.nec.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Cc: kexec@lists.infradead.org
Link: https://lore.kernel.org/r/1589395957-24628-3-git-send-email-bhsharma@redhat.com
[catalin.marinas@arm.com: removed vabits_actual from the commit log]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/admin-guide/kdump/vmcoreinfo.rst | 11 +++++++++++
 arch/arm64/include/asm/pgtable-hwdef.h         |  1 +
 arch/arm64/kernel/crash_core.c                 | 10 ++++++++++
 3 files changed, 22 insertions(+)

diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst
index 2a632020f809..2baad0bfb09d 100644
--- a/Documentation/admin-guide/kdump/vmcoreinfo.rst
+++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst
@@ -404,6 +404,17 @@ KERNELPACMASK
 The mask to extract the Pointer Authentication Code from a kernel virtual
 address.
 
+TCR_EL1.T1SZ
+------------
+
+Indicates the size offset of the memory region addressed by TTBR1_EL1.
+The region size is 2^(64-T1SZ) bytes.
+
+TTBR1_EL1 is the table base address register specified by ARMv8-A
+architecture which is used to lookup the page-tables for the Virtual
+addresses in the higher VA range (refer to ARMv8 ARM document for
+more details).
+
 arm
 ===
 
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 9c91a8f93a0e..9a757d724974 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -216,6 +216,7 @@
 #define TCR_TxSZ(x)		(TCR_T0SZ(x) | TCR_T1SZ(x))
 #define TCR_TxSZ_WIDTH		6
 #define TCR_T0SZ_MASK		(((UL(1) << TCR_TxSZ_WIDTH) - 1) << TCR_T0SZ_OFFSET)
+#define TCR_T1SZ_MASK		(((UL(1) << TCR_TxSZ_WIDTH) - 1) << TCR_T1SZ_OFFSET)
 
 #define TCR_EPD0_SHIFT		7
 #define TCR_EPD0_MASK		(UL(1) << TCR_EPD0_SHIFT)
diff --git a/arch/arm64/kernel/crash_core.c b/arch/arm64/kernel/crash_core.c
index 1f646b07e3e9..314391a156ee 100644
--- a/arch/arm64/kernel/crash_core.c
+++ b/arch/arm64/kernel/crash_core.c
@@ -7,6 +7,14 @@
 #include <linux/crash_core.h>
 #include <asm/cpufeature.h>
 #include <asm/memory.h>
+#include <asm/pgtable-hwdef.h>
+
+static inline u64 get_tcr_el1_t1sz(void);
+
+static inline u64 get_tcr_el1_t1sz(void)
+{
+	return (read_sysreg(tcr_el1) & TCR_T1SZ_MASK) >> TCR_T1SZ_OFFSET;
+}
 
 void arch_crash_save_vmcoreinfo(void)
 {
@@ -16,6 +24,8 @@ void arch_crash_save_vmcoreinfo(void)
 						kimage_voffset);
 	vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n",
 						PHYS_OFFSET);
+	vmcoreinfo_append_str("NUMBER(TCR_EL1_T1SZ)=0x%llx\n",
+						get_tcr_el1_t1sz());
 	vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
 	vmcoreinfo_append_str("NUMBER(KERNELPACMASK)=0x%llx\n",
 						system_supports_address_auth() ?

From dd72078466ecd525f4d489e7b0093cd9b5044c8e Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 25 Jun 2020 14:15:07 +0100
Subject: [PATCH 249/502] arm64: Document sysctls for emulated deprecated
 instructions

We have support for emulating a number of deprecated instructions in the
kernel with individual Kconfig options enabling this support per
instruction. In addition to the Kconfig options we also provide runtime
control via sysctls but this is not currently mentioned in the Kconfig so
not very discoverable for users. This is particularly important for
SWP/SWPB since this is disabled by default at runtime and must be enabled
via the sysctl, causing considerable frustration for users who have enabled
the config option and are then confused to find that the instruction is
still faulting.

Add a reference to the sysctls in the help text for each of the config
options, noting that SWP/SWPB is disabled by default, to improve the
user experience.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20200625131507.32334-1-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 66dc41fd49f2..6c560caf9503 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1327,6 +1327,8 @@ config SWP_EMULATION
 	  ARMv8 obsoletes the use of A32 SWP/SWPB instructions such that
 	  they are always undefined. Say Y here to enable software
 	  emulation of these instructions for userspace using LDXR/STXR.
+	  This feature can be controlled at runtime with the abi.swp
+	  sysctl which is disabled by default.
 
 	  In some older versions of glibc [<=2.8] SWP is used during futex
 	  trylock() operations with the assumption that the code will not
@@ -1353,7 +1355,8 @@ config CP15_BARRIER_EMULATION
 	  Say Y here to enable software emulation of these
 	  instructions for AArch32 userspace code. When this option is
 	  enabled, CP15 barrier usage is traced which can help
-	  identify software that needs updating.
+	  identify software that needs updating. This feature can be
+	  controlled at runtime with the abi.cp15_barrier sysctl.
 
 	  If unsure, say Y
 
@@ -1364,7 +1367,8 @@ config SETEND_EMULATION
 	  AArch32 EL0, and is deprecated in ARMv8.
 
 	  Say Y here to enable software emulation of the instruction
-	  for AArch32 userspace code.
+	  for AArch32 userspace code. This feature can be controlled
+	  at runtime with the abi.setend sysctl.
 
 	  Note: All the cpus on the system must have mixed endian support at EL0
 	  for this feature to be enabled. If a new CPU - which doesn't support mixed

From 24840e76bf8a679d26d373a0edc44284bfd9dc18 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 1 Jul 2020 11:16:16 +0200
Subject: [PATCH 250/502] s390/smp: move smp_cpus_done() to header file

Saves us a couple of bytes.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/asm/smp.h | 4 ++++
 arch/s390/kernel/smp.c      | 4 ----
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index 7326f110d48c..20b37b059e2b 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -54,6 +54,10 @@ static inline int smp_get_base_cpu(int cpu)
 	return cpu - (cpu % (smp_cpu_mtid + 1));
 }
 
+static inline void smp_cpus_done(unsigned int max_cpus)
+{
+}
+
 extern int smp_rescan_cpus(void);
 extern void __noreturn cpu_die(void);
 extern void __cpu_die(unsigned int cpu);
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index e6be63ff162a..b4f2795a123d 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -1012,10 +1012,6 @@ void __init smp_prepare_boot_cpu(void)
 	smp_cpu_set_polarization(0, POLARIZATION_UNKNOWN);
 }
 
-void __init smp_cpus_done(unsigned int max_cpus)
-{
-}
-
 void __init smp_setup_processor_id(void)
 {
 	pcpu_devices[0].address = stap();

From 8e1398f8987851bb266c1d8d911752a18e1d05b4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 1 Jul 2020 11:17:52 +0200
Subject: [PATCH 251/502] s390/smp: add missing linebreak

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/kernel/smp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b4f2795a123d..f685a38f166d 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -1141,6 +1141,7 @@ static int smp_cpu_online(unsigned int cpu)
 
 	return sysfs_create_group(&s->kobj, &cpu_online_attr_group);
 }
+
 static int smp_cpu_pre_down(unsigned int cpu)
 {
 	struct device *s = &per_cpu(cpu_device, cpu)->dev;

From 0ef5d691aae0322cbab0807c184ba534536a4698 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Tue, 30 Jun 2020 10:42:40 +0200
Subject: [PATCH 252/502] s390/extmem: remove stale -ENOSPC comment and
 handling

segment_load() will no longer return -ENOSPC. If a segment overlaps with
storage, we now also return -EBUSY. Remove the stale comment from
__segment_load() and the stale handling from segment_warning().

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Suggested-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200630084240.8283-1-david@redhat.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/mm/extmem.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index 105c09282f8c..5060956b8e7d 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -401,8 +401,7 @@ __segment_load (char *name, int do_nonshared, unsigned long *addr, unsigned long
  * -EIO     : could not perform query or load diagnose
  * -ENOENT  : no such segment
  * -EOPNOTSUPP: multi-part segment cannot be used with linux
- * -ENOSPC  : segment cannot be used (overlaps with storage)
- * -EBUSY   : segment can temporarily not be used (overlaps with dcss)
+ * -EBUSY   : segment cannot be used (overlaps with dcss or storage)
  * -ERANGE  : segment cannot be used (exceeds kernel mapping range)
  * -EPERM   : segment is currently loaded with incompatible permissions
  * -ENOMEM  : out of memory
@@ -627,10 +626,6 @@ void segment_warning(int rc, char *seg_name)
 		pr_err("DCSS %s has multiple page ranges and cannot be "
 		       "loaded or queried\n", seg_name);
 		break;
-	case -ENOSPC:
-		pr_err("DCSS %s overlaps with used storage and cannot "
-		       "be loaded\n", seg_name);
-		break;
 	case -EBUSY:
 		pr_err("%s needs used memory resources and cannot be "
 		       "loaded or queried\n", seg_name);

From c6337c6e89a695819d94949a7170e1bd0d131e31 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.ibm.com>
Date: Tue, 30 Jun 2020 09:42:23 +0200
Subject: [PATCH 253/502] s390/pkey: fix smatch warning inconsistent indenting

Fix smatch warnings:
pkey_api.c:1606 pkey_ccacipher_aes_attr_read() warn: inconsistent indenting

Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 drivers/s390/crypto/pkey_api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/crypto/pkey_api.c b/drivers/s390/crypto/pkey_api.c
index 74e63ec49068..d5880f52dc2b 100644
--- a/drivers/s390/crypto/pkey_api.c
+++ b/drivers/s390/crypto/pkey_api.c
@@ -1603,8 +1603,8 @@ static ssize_t pkey_ccacipher_aes_attr_read(enum pkey_key_size keybits,
 		if (rc == 0)
 			break;
 	}
-		if (rc)
-			return rc;
+	if (rc)
+		return rc;
 
 	if (is_xts) {
 		keysize = CCACIPHERTOKENSIZE;

From 47c07bffeb32aa2a8e798d8ce25fa693e1364e11 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.ibm.com>
Date: Tue, 30 Jun 2020 09:54:50 +0200
Subject: [PATCH 254/502] s390/zcrypt: fix smatch warnings

Fix these smatch warnings:

zcrypt_api.c:986 _zcrypt_send_ep11_cprb() error: uninitialized symbol 'pref_weight'.
zcrypt_api.c:1008 _zcrypt_send_ep11_cprb() error: uninitialized symbol 'weight'.
zcrypt_api.c:676 zcrypt_rsa_modexpo() error: uninitialized symbol 'pref_weight'.
zcrypt_api.c:694 zcrypt_rsa_modexpo() error: uninitialized symbol 'weight'.
zcrypt_api.c:760 zcrypt_rsa_crt() error: uninitialized symbol 'pref_weight'.
zcrypt_api.c:778 zcrypt_rsa_crt() error: uninitialized symbol 'weight'.
zcrypt_api.c:824 _zcrypt_send_cprb() warn: always true condition '(tdom >= 0) => (0-u16max >= 0)'
zcrypt_api.c:846 _zcrypt_send_cprb() error: uninitialized symbol 'pref_weight'.
zcrypt_api.c:867 _zcrypt_send_cprb() error: uninitialized symbol 'weight'.
zcrypt_api.c:1065 zcrypt_rng() error: uninitialized symbol 'pref_weight'.
zcrypt_api.c:1079 zcrypt_rng() error: uninitialized symbol 'weight'.
zcrypt_cex4.c:251 ep11_card_op_modes_show() warn: should '(1 << ep11_op_modes[i]->mode_bit)' be a 64 bit type?
zcrypt_cex4.c:346 ep11_queue_op_modes_show() warn: should '(1 << ep11_op_modes[i]->mode_bit)' be a 64 bit type?

Reported-by: kernel test robot <lkp@intel.com>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>

Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 drivers/s390/crypto/zcrypt_api.c  | 12 ++++++------
 drivers/s390/crypto/zcrypt_cex4.c |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 56a405dce8bc..7775ff84f223 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -634,7 +634,7 @@ static long zcrypt_rsa_modexpo(struct ap_perms *perms,
 {
 	struct zcrypt_card *zc, *pref_zc;
 	struct zcrypt_queue *zq, *pref_zq;
-	unsigned int weight, pref_weight;
+	unsigned int weight = 0, pref_weight = 0;
 	unsigned int func_code;
 	int qid = 0, rc = -ENODEV;
 	struct module *mod;
@@ -718,7 +718,7 @@ static long zcrypt_rsa_crt(struct ap_perms *perms,
 {
 	struct zcrypt_card *zc, *pref_zc;
 	struct zcrypt_queue *zq, *pref_zq;
-	unsigned int weight, pref_weight;
+	unsigned int weight = 0, pref_weight = 0;
 	unsigned int func_code;
 	int qid = 0, rc = -ENODEV;
 	struct module *mod;
@@ -803,7 +803,7 @@ static long _zcrypt_send_cprb(struct ap_perms *perms,
 	struct zcrypt_card *zc, *pref_zc;
 	struct zcrypt_queue *zq, *pref_zq;
 	struct ap_message ap_msg;
-	unsigned int weight, pref_weight;
+	unsigned int weight = 0, pref_weight = 0;
 	unsigned int func_code;
 	unsigned short *domain, tdom;
 	int qid = 0, rc = -ENODEV;
@@ -822,7 +822,7 @@ static long _zcrypt_send_cprb(struct ap_perms *perms,
 	 * domain but a control only domain, use the default domain as target.
 	 */
 	tdom = *domain;
-	if (tdom >= 0 && tdom < AP_DOMAINS &&
+	if (tdom < AP_DOMAINS &&
 	    !ap_test_config_usage_domain(tdom) &&
 	    ap_test_config_ctrl_domain(tdom) &&
 	    ap_domain_index >= 0)
@@ -931,7 +931,7 @@ static long _zcrypt_send_ep11_cprb(struct ap_perms *perms,
 	struct zcrypt_queue *zq, *pref_zq;
 	struct ep11_target_dev *targets;
 	unsigned short target_num;
-	unsigned int weight, pref_weight;
+	unsigned int weight = 0, pref_weight = 0;
 	unsigned int func_code;
 	struct ap_message ap_msg;
 	int qid = 0, rc = -ENODEV;
@@ -1040,7 +1040,7 @@ static long zcrypt_rng(char *buffer)
 {
 	struct zcrypt_card *zc, *pref_zc;
 	struct zcrypt_queue *zq, *pref_zq;
-	unsigned int weight, pref_weight;
+	unsigned int weight = 0, pref_weight = 0;
 	unsigned int func_code;
 	struct ap_message ap_msg;
 	unsigned int domain;
diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c
index cdaa8348ad04..337ec71ddb58 100644
--- a/drivers/s390/crypto/zcrypt_cex4.c
+++ b/drivers/s390/crypto/zcrypt_cex4.c
@@ -250,7 +250,7 @@ static ssize_t ep11_card_op_modes_show(struct device *dev,
 	ep11_get_card_info(ac->id, &ci, zc->online);
 
 	for (i = 0; ep11_op_modes[i].mode_txt; i++) {
-		if (ci.op_mode & (1 << ep11_op_modes[i].mode_bit)) {
+		if (ci.op_mode & (1ULL << ep11_op_modes[i].mode_bit)) {
 			if (n > 0)
 				buf[n++] = ' ';
 			n += scnprintf(buf + n, PAGE_SIZE - n,
@@ -345,7 +345,7 @@ static ssize_t ep11_queue_op_modes_show(struct device *dev,
 				     &di);
 
 	for (i = 0; ep11_op_modes[i].mode_txt; i++) {
-		if (di.op_mode & (1 << ep11_op_modes[i].mode_bit)) {
+		if (di.op_mode & (1ULL << ep11_op_modes[i].mode_bit)) {
 			if (n > 0)
 				buf[n++] = ' ';
 			n += scnprintf(buf + n, PAGE_SIZE - n,

From 74ecbef7b90800e368809642ecc671ba4a57ab09 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.ibm.com>
Date: Thu, 30 Apr 2020 12:23:29 +0200
Subject: [PATCH 255/502] s390/zcrypt: code beautification and struct field
 renames

Some beautifications related to the internal-only struct ap_message
and related code. Instead of one int carrying only the special flag,
a u32 flags field is now used.

At struct CPRBX the pointers to additional data are now marked
with __user. This required some changes in code where
these structs are also used within the zcrypt misc functions.

The ica_rsa_* structs now use the generic types __u8, __u32, ...
instead of char, unsigned int.

zcrypt_msg6 and zcrypt_msg50 use min_t() instead of min().

Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 arch/s390/include/uapi/asm/zcrypt.h    | 140 ++++++++++++-------------
 drivers/s390/crypto/ap_bus.h           |  11 +-
 drivers/s390/crypto/ap_queue.c         |   9 +-
 drivers/s390/crypto/zcrypt_ccamisc.c   |  69 ++++++------
 drivers/s390/crypto/zcrypt_cex2c.c     |  15 ++-
 drivers/s390/crypto/zcrypt_error.h     |   4 +-
 drivers/s390/crypto/zcrypt_msgtype50.c |  64 +++++------
 drivers/s390/crypto/zcrypt_msgtype6.c  | 112 ++++++++++----------
 drivers/s390/crypto/zcrypt_msgtype6.h  |   4 +-
 drivers/s390/crypto/zcrypt_queue.c     |   8 +-
 10 files changed, 217 insertions(+), 219 deletions(-)

diff --git a/arch/s390/include/uapi/asm/zcrypt.h b/arch/s390/include/uapi/asm/zcrypt.h
index 5a2177e96e88..22fd202856bc 100644
--- a/arch/s390/include/uapi/asm/zcrypt.h
+++ b/arch/s390/include/uapi/asm/zcrypt.h
@@ -36,12 +36,12 @@
  * - length(n_modulus) = inputdatalength
  */
 struct ica_rsa_modexpo {
-	char __user  *inputdata;
-	unsigned int  inputdatalength;
-	char __user  *outputdata;
-	unsigned int  outputdatalength;
-	char __user  *b_key;
-	char __user  *n_modulus;
+	__u8 __user  *inputdata;
+	__u32	      inputdatalength;
+	__u8 __user  *outputdata;
+	__u32	      outputdatalength;
+	__u8 __user  *b_key;
+	__u8 __user  *n_modulus;
 };
 
 /**
@@ -59,15 +59,15 @@ struct ica_rsa_modexpo {
  * - length(u_mult_inv) = inputdatalength/2 + 8
  */
 struct ica_rsa_modexpo_crt {
-	char __user  *inputdata;
-	unsigned int  inputdatalength;
-	char __user  *outputdata;
-	unsigned int  outputdatalength;
-	char __user  *bp_key;
-	char __user  *bq_key;
-	char __user  *np_prime;
-	char __user  *nq_prime;
-	char __user  *u_mult_inv;
+	__u8 __user  *inputdata;
+	__u32	      inputdatalength;
+	__u8 __user  *outputdata;
+	__u32	      outputdatalength;
+	__u8 __user  *bp_key;
+	__u8 __user  *bq_key;
+	__u8 __user  *np_prime;
+	__u8 __user  *nq_prime;
+	__u8 __user  *u_mult_inv;
 };
 
 /**
@@ -83,67 +83,67 @@ struct ica_rsa_modexpo_crt {
  *	    key block
  */
 struct CPRBX {
-	unsigned short	cprb_len;	/* CPRB length	      220	 */
-	unsigned char	cprb_ver_id;	/* CPRB version id.   0x02	 */
-	unsigned char	pad_000[3];	/* Alignment pad bytes		 */
-	unsigned char	func_id[2];	/* function id	      0x5432	 */
-	unsigned char	cprb_flags[4];	/* Flags			 */
-	unsigned int	req_parml;	/* request parameter buffer len	 */
-	unsigned int	req_datal;	/* request data buffer		 */
-	unsigned int	rpl_msgbl;	/* reply  message block length	 */
-	unsigned int	rpld_parml;	/* replied parameter block len	 */
-	unsigned int	rpl_datal;	/* reply data block len		 */
-	unsigned int	rpld_datal;	/* replied data block len	 */
-	unsigned int	req_extbl;	/* request extension block len	 */
-	unsigned char	pad_001[4];	/* reserved			 */
-	unsigned int	rpld_extbl;	/* replied extension block len	 */
-	unsigned char	padx000[16 - sizeof(char *)];
-	unsigned char  *req_parmb;	/* request parm block 'address'	 */
-	unsigned char	padx001[16 - sizeof(char *)];
-	unsigned char  *req_datab;	/* request data block 'address'	 */
-	unsigned char	padx002[16 - sizeof(char *)];
-	unsigned char  *rpl_parmb;	/* reply parm block 'address'	 */
-	unsigned char	padx003[16 - sizeof(char *)];
-	unsigned char  *rpl_datab;	/* reply data block 'address'	 */
-	unsigned char	padx004[16 - sizeof(char *)];
-	unsigned char  *req_extb;	/* request extension block 'addr'*/
-	unsigned char	padx005[16 - sizeof(char *)];
-	unsigned char  *rpl_extb;	/* reply extension block 'address'*/
-	unsigned short	ccp_rtcode;	/* server return code		 */
-	unsigned short	ccp_rscode;	/* server reason code		 */
-	unsigned int	mac_data_len;	/* Mac Data Length		 */
-	unsigned char	logon_id[8];	/* Logon Identifier		 */
-	unsigned char	mac_value[8];	/* Mac Value			 */
-	unsigned char	mac_content_flgs;/* Mac content flag byte	 */
-	unsigned char	pad_002;	/* Alignment			 */
-	unsigned short	domain;		/* Domain			 */
-	unsigned char	usage_domain[4];/* Usage domain			 */
-	unsigned char	cntrl_domain[4];/* Control domain		 */
-	unsigned char	S390enf_mask[4];/* S/390 enforcement mask	 */
-	unsigned char	pad_004[36];	/* reserved			 */
+	__u16	     cprb_len;		/* CPRB length	      220	 */
+	__u8	     cprb_ver_id;	/* CPRB version id.   0x02	 */
+	__u8	     pad_000[3];	/* Alignment pad bytes		 */
+	__u8	     func_id[2];	/* function id	      0x5432	 */
+	__u8	     cprb_flags[4];	/* Flags			 */
+	__u32	     req_parml;		/* request parameter buffer len	 */
+	__u32	     req_datal;		/* request data buffer		 */
+	__u32	     rpl_msgbl;		/* reply  message block length	 */
+	__u32	     rpld_parml;	/* replied parameter block len	 */
+	__u32	     rpl_datal;		/* reply data block len		 */
+	__u32	     rpld_datal;	/* replied data block len	 */
+	__u32	     req_extbl;		/* request extension block len	 */
+	__u8	     pad_001[4];	/* reserved			 */
+	__u32	     rpld_extbl;	/* replied extension block len	 */
+	__u8	     padx000[16 - sizeof(__u8 *)];
+	__u8 __user *req_parmb;		/* request parm block 'address'	 */
+	__u8	     padx001[16 - sizeof(__u8 *)];
+	__u8 __user *req_datab;		/* request data block 'address'	 */
+	__u8	     padx002[16 - sizeof(__u8 *)];
+	__u8 __user *rpl_parmb;		/* reply parm block 'address'	 */
+	__u8	     padx003[16 - sizeof(__u8 *)];
+	__u8 __user *rpl_datab;		/* reply data block 'address'	 */
+	__u8	     padx004[16 - sizeof(__u8 *)];
+	__u8 __user *req_extb;		/* request extension block 'addr'*/
+	__u8	     padx005[16 - sizeof(__u8 *)];
+	__u8 __user *rpl_extb;		/* reply extension block 'address'*/
+	__u16	     ccp_rtcode;	/* server return code		 */
+	__u16	     ccp_rscode;	/* server reason code		 */
+	__u32	     mac_data_len;	/* Mac Data Length		 */
+	__u8	     logon_id[8];	/* Logon Identifier		 */
+	__u8	     mac_value[8];	/* Mac Value			 */
+	__u8	     mac_content_flgs;	/* Mac content flag byte	 */
+	__u8	     pad_002;		/* Alignment			 */
+	__u16	     domain;		/* Domain			 */
+	__u8	     usage_domain[4];	/* Usage domain			 */
+	__u8	     cntrl_domain[4];	/* Control domain		 */
+	__u8	     S390enf_mask[4];	/* S/390 enforcement mask	 */
+	__u8	     pad_004[36];	/* reserved			 */
 } __attribute__((packed));
 
 /**
  * xcRB
  */
 struct ica_xcRB {
-	unsigned short	agent_ID;
-	unsigned int	user_defined;
-	unsigned short	request_ID;
-	unsigned int	request_control_blk_length;
-	unsigned char	padding1[16 - sizeof(char *)];
-	char __user    *request_control_blk_addr;
-	unsigned int	request_data_length;
-	char		padding2[16 - sizeof(char *)];
-	char __user    *request_data_address;
-	unsigned int	reply_control_blk_length;
-	char		padding3[16 - sizeof(char *)];
-	char __user    *reply_control_blk_addr;
-	unsigned int	reply_data_length;
-	char		padding4[16 - sizeof(char *)];
-	char __user    *reply_data_addr;
-	unsigned short	priority_window;
-	unsigned int	status;
+	__u16	      agent_ID;
+	__u32	      user_defined;
+	__u16	      request_ID;
+	__u32	      request_control_blk_length;
+	__u8	      _padding1[16 - sizeof(__u8 *)];
+	__u8 __user  *request_control_blk_addr;
+	__u32	      request_data_length;
+	__u8	      _padding2[16 - sizeof(__u8 *)];
+	__u8 __user  *request_data_address;
+	__u32	      reply_control_blk_length;
+	__u8	      _padding3[16 - sizeof(__u8 *)];
+	__u8 __user  *reply_control_blk_addr;
+	__u32	      reply_data_length;
+	__u8	      __padding4[16 - sizeof(__u8 *)];
+	__u8 __user  *reply_data_addr;
+	__u16	      priority_window;
+	__u32	      status;
 } __attribute__((packed));
 
 /**
diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h
index 053cc34d2ca2..69432e93643a 100644
--- a/drivers/s390/crypto/ap_bus.h
+++ b/drivers/s390/crypto/ap_bus.h
@@ -190,17 +190,18 @@ typedef enum ap_wait (ap_func_t)(struct ap_queue *queue);
 struct ap_message {
 	struct list_head list;		/* Request queueing. */
 	unsigned long long psmid;	/* Message id. */
-	void *message;			/* Pointer to message buffer. */
-	size_t length;			/* Message length. */
+	void *msg;			/* Pointer to message buffer. */
+	unsigned int len;		/* Message length. */
+	u32 flags;			/* Flags, see AP_MSG_FLAG_xxx */
 	int rc;				/* Return code for this message */
-
 	void *private;			/* ap driver private pointer. */
-	unsigned int special:1;		/* Used for special commands. */
 	/* receive is called from tasklet context */
 	void (*receive)(struct ap_queue *, struct ap_message *,
 			struct ap_message *);
 };
 
+#define AP_MSG_FLAG_SPECIAL  (1 << 16)	/* flag msg as 'special' with NQAP */
+
 /**
  * ap_init_message() - Initialize ap_message.
  * Initialize a message before using. Otherwise this might result in
@@ -218,7 +219,7 @@ static inline void ap_init_message(struct ap_message *ap_msg)
  */
 static inline void ap_release_message(struct ap_message *ap_msg)
 {
-	kzfree(ap_msg->message);
+	kzfree(ap_msg->msg);
 	kzfree(ap_msg->private);
 }
 
diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c
index 73b077dca3e6..d6cc384f294b 100644
--- a/drivers/s390/crypto/ap_queue.c
+++ b/drivers/s390/crypto/ap_queue.c
@@ -69,9 +69,9 @@ static int ap_queue_enable_interruption(struct ap_queue *aq, void *ind)
  */
 static inline struct ap_queue_status
 __ap_send(ap_qid_t qid, unsigned long long psmid, void *msg, size_t length,
-	  unsigned int special)
+	  int special)
 {
-	if (special == 1)
+	if (special)
 		qid |= 0x400000UL;
 	return ap_nqap(qid, psmid, msg, length);
 }
@@ -137,7 +137,7 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq)
 	struct ap_message *ap_msg;
 
 	status = ap_dqap(aq->qid, &aq->reply->psmid,
-			 aq->reply->message, aq->reply->length);
+			 aq->reply->msg, aq->reply->len);
 	switch (status.response_code) {
 	case AP_RESPONSE_NORMAL:
 		aq->queue_count--;
@@ -216,7 +216,8 @@ static enum ap_wait ap_sm_write(struct ap_queue *aq)
 	/* Start the next request on the queue. */
 	ap_msg = list_entry(aq->requestq.next, struct ap_message, list);
 	status = __ap_send(aq->qid, ap_msg->psmid,
-			   ap_msg->message, ap_msg->length, ap_msg->special);
+			   ap_msg->msg, ap_msg->len,
+			   ap_msg->flags & AP_MSG_FLAG_SPECIAL);
 	switch (status.response_code) {
 	case AP_RESPONSE_NORMAL:
 		aq->queue_count++;
diff --git a/drivers/s390/crypto/zcrypt_ccamisc.c b/drivers/s390/crypto/zcrypt_ccamisc.c
index 1b835398feec..3f5b61351cde 100644
--- a/drivers/s390/crypto/zcrypt_ccamisc.c
+++ b/drivers/s390/crypto/zcrypt_ccamisc.c
@@ -205,9 +205,9 @@ static int alloc_and_prep_cprbmem(size_t paramblen,
 	preqcblk->rpl_msgbl = cprbplusparamblen;
 	if (paramblen) {
 		preqcblk->req_parmb =
-			((u8 *) preqcblk) + sizeof(struct CPRBX);
+			((u8 __user *) preqcblk) + sizeof(struct CPRBX);
 		preqcblk->rpl_parmb =
-			((u8 *) prepcblk) + sizeof(struct CPRBX);
+			((u8 __user *) prepcblk) + sizeof(struct CPRBX);
 	}
 
 	*pcprbmem = cprbmem;
@@ -274,7 +274,7 @@ int cca_genseckey(u16 cardnr, u16 domain,
 {
 	int i, rc, keysize;
 	int seckeysize;
-	u8 *mem;
+	u8 *mem, *ptr;
 	struct CPRBX *preqcblk, *prepcblk;
 	struct ica_xcRB xcrb;
 	struct kgreqparm {
@@ -320,7 +320,7 @@ int cca_genseckey(u16 cardnr, u16 domain,
 	preqcblk->domain = domain;
 
 	/* fill request cprb param block with KG request */
-	preqparm = (struct kgreqparm *) preqcblk->req_parmb;
+	preqparm = (struct kgreqparm __force *) preqcblk->req_parmb;
 	memcpy(preqparm->subfunc_code, "KG", 2);
 	preqparm->rule_array_len = sizeof(preqparm->rule_array_len);
 	preqparm->lv1.len = sizeof(struct lv1);
@@ -377,8 +377,9 @@ int cca_genseckey(u16 cardnr, u16 domain,
 	}
 
 	/* process response cprb param block */
-	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
-	prepparm = (struct kgrepparm *) prepcblk->rpl_parmb;
+	ptr =  ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepcblk->rpl_parmb = (u8 __user *) ptr;
+	prepparm = (struct kgrepparm *) ptr;
 
 	/* check length of the returned secure key token */
 	seckeysize = prepparm->lv3.keyblock.toklen
@@ -415,7 +416,7 @@ int cca_clr2seckey(u16 cardnr, u16 domain, u32 keybitsize,
 		   const u8 *clrkey, u8 seckey[SECKEYBLOBSIZE])
 {
 	int rc, keysize, seckeysize;
-	u8 *mem;
+	u8 *mem, *ptr;
 	struct CPRBX *preqcblk, *prepcblk;
 	struct ica_xcRB xcrb;
 	struct cmreqparm {
@@ -460,7 +461,7 @@ int cca_clr2seckey(u16 cardnr, u16 domain, u32 keybitsize,
 	preqcblk->domain = domain;
 
 	/* fill request cprb param block with CM request */
-	preqparm = (struct cmreqparm *) preqcblk->req_parmb;
+	preqparm = (struct cmreqparm __force *) preqcblk->req_parmb;
 	memcpy(preqparm->subfunc_code, "CM", 2);
 	memcpy(preqparm->rule_array, "AES     ", 8);
 	preqparm->rule_array_len =
@@ -514,8 +515,9 @@ int cca_clr2seckey(u16 cardnr, u16 domain, u32 keybitsize,
 	}
 
 	/* process response cprb param block */
-	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
-	prepparm = (struct cmrepparm *) prepcblk->rpl_parmb;
+	ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepcblk->rpl_parmb = (u8 __user *) ptr;
+	prepparm = (struct cmrepparm *) ptr;
 
 	/* check length of the returned secure key token */
 	seckeysize = prepparm->lv3.keyblock.toklen
@@ -554,7 +556,7 @@ int cca_sec2protkey(u16 cardnr, u16 domain,
 		    u8 *protkey, u32 *protkeylen, u32 *protkeytype)
 {
 	int rc;
-	u8 *mem;
+	u8 *mem, *ptr;
 	struct CPRBX *preqcblk, *prepcblk;
 	struct ica_xcRB xcrb;
 	struct uskreqparm {
@@ -605,7 +607,7 @@ int cca_sec2protkey(u16 cardnr, u16 domain,
 	preqcblk->domain = domain;
 
 	/* fill request cprb param block with USK request */
-	preqparm = (struct uskreqparm *) preqcblk->req_parmb;
+	preqparm = (struct uskreqparm __force *) preqcblk->req_parmb;
 	memcpy(preqparm->subfunc_code, "US", 2);
 	preqparm->rule_array_len = sizeof(preqparm->rule_array_len);
 	preqparm->lv1.len = sizeof(struct lv1);
@@ -646,8 +648,9 @@ int cca_sec2protkey(u16 cardnr, u16 domain,
 	}
 
 	/* process response cprb param block */
-	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
-	prepparm = (struct uskrepparm *) prepcblk->rpl_parmb;
+	ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepcblk->rpl_parmb = (u8 __user *) ptr;
+	prepparm = (struct uskrepparm *) ptr;
 
 	/* check the returned keyblock */
 	if (prepparm->lv3.ckb.version != 0x01 &&
@@ -714,7 +717,7 @@ int cca_gencipherkey(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags,
 		     u8 *keybuf, size_t *keybufsize)
 {
 	int rc;
-	u8 *mem;
+	u8 *mem, *ptr;
 	struct CPRBX *preqcblk, *prepcblk;
 	struct ica_xcRB xcrb;
 	struct gkreqparm {
@@ -796,7 +799,7 @@ int cca_gencipherkey(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags,
 	preqcblk->req_parml = sizeof(struct gkreqparm);
 
 	/* prepare request param block with GK request */
-	preqparm = (struct gkreqparm *) preqcblk->req_parmb;
+	preqparm = (struct gkreqparm __force *) preqcblk->req_parmb;
 	memcpy(preqparm->subfunc_code, "GK", 2);
 	preqparm->rule_array_len =  sizeof(uint16_t) + 2 * 8;
 	memcpy(preqparm->rule_array, "AES     OP      ", 2*8);
@@ -867,8 +870,9 @@ int cca_gencipherkey(u16 cardnr, u16 domain, u32 keybitsize, u32 keygenflags,
 	}
 
 	/* process response cprb param block */
-	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
-	prepparm = (struct gkrepparm *) prepcblk->rpl_parmb;
+	ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepcblk->rpl_parmb = (u8 __user *) ptr;
+	prepparm = (struct gkrepparm *) ptr;
 
 	/* do some plausibility checks on the key block */
 	if (prepparm->kb.len < 120 + 5 * sizeof(uint16_t) ||
@@ -917,7 +921,7 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain,
 			   int *key_token_size)
 {
 	int rc, n;
-	u8 *mem;
+	u8 *mem, *ptr;
 	struct CPRBX *preqcblk, *prepcblk;
 	struct ica_xcRB xcrb;
 	struct rule_array_block {
@@ -974,7 +978,7 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain,
 	preqcblk->req_parml = 0;
 
 	/* prepare request param block with IP request */
-	preq_ra_block = (struct rule_array_block *) preqcblk->req_parmb;
+	preq_ra_block = (struct rule_array_block __force *) preqcblk->req_parmb;
 	memcpy(preq_ra_block->subfunc_code, "IP", 2);
 	preq_ra_block->rule_array_len =  sizeof(uint16_t) + 2 * 8;
 	memcpy(preq_ra_block->rule_array, rule_array_1, 8);
@@ -987,7 +991,7 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain,
 	}
 
 	/* prepare vud block */
-	preq_vud_block = (struct vud_block *)
+	preq_vud_block = (struct vud_block __force *)
 		(preqcblk->req_parmb + preqcblk->req_parml);
 	n = complete ? 0 : (clr_key_bit_size + 7) / 8;
 	preq_vud_block->len = sizeof(struct vud_block) + n;
@@ -1001,7 +1005,7 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain,
 	preqcblk->req_parml += preq_vud_block->len;
 
 	/* prepare key block */
-	preq_key_block = (struct key_block *)
+	preq_key_block = (struct key_block __force *)
 		(preqcblk->req_parmb + preqcblk->req_parml);
 	n = *key_token_size;
 	preq_key_block->len = sizeof(struct key_block) + n;
@@ -1034,8 +1038,9 @@ static int _ip_cprb_helper(u16 cardnr, u16 domain,
 	}
 
 	/* process response cprb param block */
-	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
-	prepparm = (struct iprepparm *) prepcblk->rpl_parmb;
+	ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepcblk->rpl_parmb = (u8 __user *) ptr;
+	prepparm = (struct iprepparm *) ptr;
 
 	/* do some plausibility checks on the key block */
 	if (prepparm->kb.len < 120 + 3 * sizeof(uint16_t) ||
@@ -1151,7 +1156,7 @@ int cca_cipher2protkey(u16 cardnr, u16 domain, const u8 *ckey,
 		       u8 *protkey, u32 *protkeylen, u32 *protkeytype)
 {
 	int rc;
-	u8 *mem;
+	u8 *mem, *ptr;
 	struct CPRBX *preqcblk, *prepcblk;
 	struct ica_xcRB xcrb;
 	struct aureqparm {
@@ -1208,7 +1213,7 @@ int cca_cipher2protkey(u16 cardnr, u16 domain, const u8 *ckey,
 	preqcblk->domain = domain;
 
 	/* fill request cprb param block with AU request */
-	preqparm = (struct aureqparm *) preqcblk->req_parmb;
+	preqparm = (struct aureqparm __force *) preqcblk->req_parmb;
 	memcpy(preqparm->subfunc_code, "AU", 2);
 	preqparm->rule_array_len =
 		sizeof(preqparm->rule_array_len)
@@ -1257,8 +1262,9 @@ int cca_cipher2protkey(u16 cardnr, u16 domain, const u8 *ckey,
 	}
 
 	/* process response cprb param block */
-	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
-	prepparm = (struct aurepparm *) prepcblk->rpl_parmb;
+	ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepcblk->rpl_parmb = (u8 __user *) ptr;
+	prepparm = (struct aurepparm *) ptr;
 
 	/* check the returned keyblock */
 	if (prepparm->vud.ckb.version != 0x01 &&
@@ -1347,7 +1353,7 @@ int cca_query_crypto_facility(u16 cardnr, u16 domain,
 	preqcblk->domain = domain;
 
 	/* fill request cprb param block with FQ request */
-	preqparm = (struct fqreqparm *) preqcblk->req_parmb;
+	preqparm = (struct fqreqparm __force *) preqcblk->req_parmb;
 	memcpy(preqparm->subfunc_code, "FQ", 2);
 	memcpy(preqparm->rule_array, keyword, sizeof(preqparm->rule_array));
 	preqparm->rule_array_len =
@@ -1378,8 +1384,9 @@ int cca_query_crypto_facility(u16 cardnr, u16 domain,
 	}
 
 	/* process response cprb param block */
-	prepcblk->rpl_parmb = ((u8 *) prepcblk) + sizeof(struct CPRBX);
-	prepparm = (struct fqrepparm *) prepcblk->rpl_parmb;
+	ptr = ((u8 *) prepcblk) + sizeof(struct CPRBX);
+	prepcblk->rpl_parmb = (u8 __user *) ptr;
+	prepparm = (struct fqrepparm *) ptr;
 	ptr = prepparm->lvdata;
 
 	/* check and possibly copy reply rule array */
diff --git a/drivers/s390/crypto/zcrypt_cex2c.c b/drivers/s390/crypto/zcrypt_cex2c.c
index 266440168bb7..993addb726e0 100644
--- a/drivers/s390/crypto/zcrypt_cex2c.c
+++ b/drivers/s390/crypto/zcrypt_cex2c.c
@@ -87,24 +87,23 @@ static int zcrypt_cex2c_rng_supported(struct ap_queue *aq)
 	int rc, i;
 
 	ap_init_message(&ap_msg);
-	ap_msg.message = (void *) get_zeroed_page(GFP_KERNEL);
-	if (!ap_msg.message)
+	ap_msg.msg = (void *) get_zeroed_page(GFP_KERNEL);
+	if (!ap_msg.msg)
 		return -ENOMEM;
 
 	rng_type6CPRB_msgX(&ap_msg, 4, &domain);
 
-	msg = ap_msg.message;
+	msg = ap_msg.msg;
 	msg->cprbx.domain = AP_QID_QUEUE(aq->qid);
 
-	rc = ap_send(aq->qid, 0x0102030405060708ULL, ap_msg.message,
-		     ap_msg.length);
+	rc = ap_send(aq->qid, 0x0102030405060708ULL, ap_msg.msg, ap_msg.len);
 	if (rc)
 		goto out_free;
 
 	/* Wait for the test message to complete. */
 	for (i = 0; i < 2 * HZ; i++) {
 		msleep(1000 / HZ);
-		rc = ap_recv(aq->qid, &psmid, ap_msg.message, 4096);
+		rc = ap_recv(aq->qid, &psmid, ap_msg.msg, 4096);
 		if (rc == 0 && psmid == 0x0102030405060708ULL)
 			break;
 	}
@@ -115,13 +114,13 @@ static int zcrypt_cex2c_rng_supported(struct ap_queue *aq)
 		goto out_free;
 	}
 
-	reply = ap_msg.message;
+	reply = ap_msg.msg;
 	if (reply->cprbx.ccp_rtcode == 0 && reply->cprbx.ccp_rscode == 0)
 		rc = 1;
 	else
 		rc = 0;
 out_free:
-	free_page((unsigned long) ap_msg.message);
+	free_page((unsigned long) ap_msg.msg);
 	return rc;
 }
 
diff --git a/drivers/s390/crypto/zcrypt_error.h b/drivers/s390/crypto/zcrypt_error.h
index 4f4dd9d727c9..54a04f8c38ef 100644
--- a/drivers/s390/crypto/zcrypt_error.h
+++ b/drivers/s390/crypto/zcrypt_error.h
@@ -80,7 +80,7 @@ struct error_hdr {
 static inline int convert_error(struct zcrypt_queue *zq,
 				struct ap_message *reply)
 {
-	struct error_hdr *ehdr = reply->message;
+	struct error_hdr *ehdr = reply->msg;
 	int card = AP_QID_CARD(zq->queue->qid);
 	int queue = AP_QID_QUEUE(zq->queue->qid);
 
@@ -127,7 +127,7 @@ static inline int convert_error(struct zcrypt_queue *zq,
 			struct {
 				struct type86_hdr hdr;
 				struct type86_fmt2_ext fmt2;
-			} __packed * head = reply->message;
+			} __packed * head = reply->msg;
 			unsigned int apfs = *((u32 *)head->fmt2.apfs);
 
 			ZCRYPT_DBF(DBF_ERR,
diff --git a/drivers/s390/crypto/zcrypt_msgtype50.c b/drivers/s390/crypto/zcrypt_msgtype50.c
index fc4295b3d801..7aedc338b445 100644
--- a/drivers/s390/crypto/zcrypt_msgtype50.c
+++ b/drivers/s390/crypto/zcrypt_msgtype50.c
@@ -207,10 +207,10 @@ static int ICAMEX_msg_to_type50MEX_msg(struct zcrypt_queue *zq,
 	mod_len = mex->inputdatalength;
 
 	if (mod_len <= 128) {
-		struct type50_meb1_msg *meb1 = ap_msg->message;
+		struct type50_meb1_msg *meb1 = ap_msg->msg;
 
 		memset(meb1, 0, sizeof(*meb1));
-		ap_msg->length = sizeof(*meb1);
+		ap_msg->len = sizeof(*meb1);
 		meb1->header.msg_type_code = TYPE50_TYPE_CODE;
 		meb1->header.msg_len = sizeof(*meb1);
 		meb1->keyblock_type = TYPE50_MEB1_FMT;
@@ -218,10 +218,10 @@ static int ICAMEX_msg_to_type50MEX_msg(struct zcrypt_queue *zq,
 		exp = meb1->exponent + sizeof(meb1->exponent) - mod_len;
 		inp = meb1->message + sizeof(meb1->message) - mod_len;
 	} else if (mod_len <= 256) {
-		struct type50_meb2_msg *meb2 = ap_msg->message;
+		struct type50_meb2_msg *meb2 = ap_msg->msg;
 
 		memset(meb2, 0, sizeof(*meb2));
-		ap_msg->length = sizeof(*meb2);
+		ap_msg->len = sizeof(*meb2);
 		meb2->header.msg_type_code = TYPE50_TYPE_CODE;
 		meb2->header.msg_len = sizeof(*meb2);
 		meb2->keyblock_type = TYPE50_MEB2_FMT;
@@ -229,10 +229,10 @@ static int ICAMEX_msg_to_type50MEX_msg(struct zcrypt_queue *zq,
 		exp = meb2->exponent + sizeof(meb2->exponent) - mod_len;
 		inp = meb2->message + sizeof(meb2->message) - mod_len;
 	} else if (mod_len <= 512) {
-		struct type50_meb3_msg *meb3 = ap_msg->message;
+		struct type50_meb3_msg *meb3 = ap_msg->msg;
 
 		memset(meb3, 0, sizeof(*meb3));
-		ap_msg->length = sizeof(*meb3);
+		ap_msg->len = sizeof(*meb3);
 		meb3->header.msg_type_code = TYPE50_TYPE_CODE;
 		meb3->header.msg_len = sizeof(*meb3);
 		meb3->keyblock_type = TYPE50_MEB3_FMT;
@@ -275,10 +275,10 @@ static int ICACRT_msg_to_type50CRT_msg(struct zcrypt_queue *zq,
 	 * 512 byte modulus (4k keys).
 	 */
 	if (mod_len <= 128) {		/* up to 1024 bit key size */
-		struct type50_crb1_msg *crb1 = ap_msg->message;
+		struct type50_crb1_msg *crb1 = ap_msg->msg;
 
 		memset(crb1, 0, sizeof(*crb1));
-		ap_msg->length = sizeof(*crb1);
+		ap_msg->len = sizeof(*crb1);
 		crb1->header.msg_type_code = TYPE50_TYPE_CODE;
 		crb1->header.msg_len = sizeof(*crb1);
 		crb1->keyblock_type = TYPE50_CRB1_FMT;
@@ -289,10 +289,10 @@ static int ICACRT_msg_to_type50CRT_msg(struct zcrypt_queue *zq,
 		u = crb1->u + sizeof(crb1->u) - short_len;
 		inp = crb1->message + sizeof(crb1->message) - mod_len;
 	} else if (mod_len <= 256) {	/* up to 2048 bit key size */
-		struct type50_crb2_msg *crb2 = ap_msg->message;
+		struct type50_crb2_msg *crb2 = ap_msg->msg;
 
 		memset(crb2, 0, sizeof(*crb2));
-		ap_msg->length = sizeof(*crb2);
+		ap_msg->len = sizeof(*crb2);
 		crb2->header.msg_type_code = TYPE50_TYPE_CODE;
 		crb2->header.msg_len = sizeof(*crb2);
 		crb2->keyblock_type = TYPE50_CRB2_FMT;
@@ -304,10 +304,10 @@ static int ICACRT_msg_to_type50CRT_msg(struct zcrypt_queue *zq,
 		inp = crb2->message + sizeof(crb2->message) - mod_len;
 	} else if ((mod_len <= 512) &&	/* up to 4096 bit key size */
 		   (zq->zcard->max_mod_size == CEX3A_MAX_MOD_SIZE)) {
-		struct type50_crb3_msg *crb3 = ap_msg->message;
+		struct type50_crb3_msg *crb3 = ap_msg->msg;
 
 		memset(crb3, 0, sizeof(*crb3));
-		ap_msg->length = sizeof(*crb3);
+		ap_msg->len = sizeof(*crb3);
 		crb3->header.msg_type_code = TYPE50_TYPE_CODE;
 		crb3->header.msg_len = sizeof(*crb3);
 		crb3->keyblock_type = TYPE50_CRB3_FMT;
@@ -350,7 +350,7 @@ static int convert_type80(struct zcrypt_queue *zq,
 			  char __user *outputdata,
 			  unsigned int outputdatalength)
 {
-	struct type80_hdr *t80h = reply->message;
+	struct type80_hdr *t80h = reply->msg;
 	unsigned char *data;
 
 	if (t80h->len < sizeof(*t80h) + outputdatalength) {
@@ -370,7 +370,7 @@ static int convert_type80(struct zcrypt_queue *zq,
 		BUG_ON(t80h->len > CEX2A_MAX_RESPONSE_SIZE);
 	else
 		BUG_ON(t80h->len > CEX3A_MAX_RESPONSE_SIZE);
-	data = reply->message + t80h->len - outputdatalength;
+	data = reply->msg + t80h->len - outputdatalength;
 	if (copy_to_user(outputdata, data, outputdatalength))
 		return -EFAULT;
 	return 0;
@@ -382,7 +382,7 @@ static int convert_response(struct zcrypt_queue *zq,
 			    unsigned int outputdatalength)
 {
 	/* Response type byte is the second byte in the response. */
-	unsigned char rtype = ((unsigned char *) reply->message)[1];
+	unsigned char rtype = ((unsigned char *) reply->msg)[1];
 
 	switch (rtype) {
 	case TYPE82_RSP_CODE:
@@ -422,22 +422,20 @@ static void zcrypt_cex2a_receive(struct ap_queue *aq,
 		.reply_code = REP82_ERROR_MACHINE_FAILURE,
 	};
 	struct type80_hdr *t80h;
-	int length;
+	int len;
 
 	/* Copy the reply message to the request message buffer. */
 	if (!reply)
 		goto out;	/* ap_msg->rc indicates the error */
-	t80h = reply->message;
+	t80h = reply->msg;
 	if (t80h->type == TYPE80_RSP_CODE) {
 		if (aq->ap_dev.device_type == AP_DEVICE_TYPE_CEX2A)
-			length = min_t(int,
-				       CEX2A_MAX_RESPONSE_SIZE, t80h->len);
+			len = min_t(int, CEX2A_MAX_RESPONSE_SIZE, t80h->len);
 		else
-			length = min_t(int,
-				       CEX3A_MAX_RESPONSE_SIZE, t80h->len);
-		memcpy(msg->message, reply->message, length);
+			len = min_t(int, CEX3A_MAX_RESPONSE_SIZE, t80h->len);
+		memcpy(msg->msg, reply->msg, len);
 	} else
-		memcpy(msg->message, reply->message, sizeof(error_reply));
+		memcpy(msg->msg, reply->msg, sizeof(error_reply));
 out:
 	complete((struct completion *) msg->private);
 }
@@ -460,12 +458,10 @@ static long zcrypt_cex2a_modexpo(struct zcrypt_queue *zq,
 
 	ap_init_message(&ap_msg);
 	if (zq->zcard->user_space_type == ZCRYPT_CEX2A)
-		ap_msg.message = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE,
-					 GFP_KERNEL);
+		ap_msg.msg = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE, GFP_KERNEL);
 	else
-		ap_msg.message = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE,
-					 GFP_KERNEL);
-	if (!ap_msg.message)
+		ap_msg.msg = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE, GFP_KERNEL);
+	if (!ap_msg.msg)
 		return -ENOMEM;
 	ap_msg.receive = zcrypt_cex2a_receive;
 	ap_msg.psmid = (((unsigned long long) current->pid) << 32) +
@@ -486,7 +482,7 @@ static long zcrypt_cex2a_modexpo(struct zcrypt_queue *zq,
 		/* Signal pending. */
 		ap_cancel_message(zq->queue, &ap_msg);
 out_free:
-	kfree(ap_msg.message);
+	kfree(ap_msg.msg);
 	return rc;
 }
 
@@ -506,12 +502,10 @@ static long zcrypt_cex2a_modexpo_crt(struct zcrypt_queue *zq,
 
 	ap_init_message(&ap_msg);
 	if (zq->zcard->user_space_type == ZCRYPT_CEX2A)
-		ap_msg.message = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE,
-					 GFP_KERNEL);
+		ap_msg.msg = kmalloc(MSGTYPE50_CRB2_MAX_MSG_SIZE, GFP_KERNEL);
 	else
-		ap_msg.message = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE,
-					 GFP_KERNEL);
-	if (!ap_msg.message)
+		ap_msg.msg = kmalloc(MSGTYPE50_CRB3_MAX_MSG_SIZE, GFP_KERNEL);
+	if (!ap_msg.msg)
 		return -ENOMEM;
 	ap_msg.receive = zcrypt_cex2a_receive;
 	ap_msg.psmid = (((unsigned long long) current->pid) << 32) +
@@ -532,7 +526,7 @@ static long zcrypt_cex2a_modexpo_crt(struct zcrypt_queue *zq,
 		/* Signal pending. */
 		ap_cancel_message(zq->queue, &ap_msg);
 out_free:
-	kfree(ap_msg.message);
+	kfree(ap_msg.msg);
 	return rc;
 }
 
diff --git a/drivers/s390/crypto/zcrypt_msgtype6.c b/drivers/s390/crypto/zcrypt_msgtype6.c
index fd1cbb2d6b3f..d77991c74c25 100644
--- a/drivers/s390/crypto/zcrypt_msgtype6.c
+++ b/drivers/s390/crypto/zcrypt_msgtype6.c
@@ -266,7 +266,7 @@ static int ICAMEX_msg_to_type6MEX_msgX(struct zcrypt_queue *zq,
 		struct function_and_rules_block fr;
 		unsigned short length;
 		char text[0];
-	} __packed * msg = ap_msg->message;
+	} __packed * msg = ap_msg->msg;
 	int size;
 
 	/*
@@ -301,7 +301,7 @@ static int ICAMEX_msg_to_type6MEX_msgX(struct zcrypt_queue *zq,
 
 	msg->cprbx.req_parml = size - sizeof(msg->hdr) - sizeof(msg->cprbx);
 
-	ap_msg->length = size;
+	ap_msg->len = size;
 	return 0;
 }
 
@@ -336,7 +336,7 @@ static int ICACRT_msg_to_type6CRT_msgX(struct zcrypt_queue *zq,
 		struct function_and_rules_block fr;
 		unsigned short length;
 		char text[0];
-	} __packed * msg = ap_msg->message;
+	} __packed * msg = ap_msg->msg;
 	int size;
 
 	/*
@@ -370,7 +370,7 @@ static int ICACRT_msg_to_type6CRT_msgX(struct zcrypt_queue *zq,
 
 	msg->fr = static_pkd_fnr;
 
-	ap_msg->length = size;
+	ap_msg->len = size;
 	return 0;
 }
 
@@ -400,11 +400,11 @@ static int XCRB_msg_to_type6CPRB_msgX(struct ap_message *ap_msg,
 	struct {
 		struct type6_hdr hdr;
 		struct CPRBX cprbx;
-	} __packed * msg = ap_msg->message;
+	} __packed * msg = ap_msg->msg;
 
 	int rcblen = CEIL4(xcRB->request_control_blk_length);
 	int replylen, req_sumlen, resp_sumlen;
-	char *req_data = ap_msg->message + sizeof(struct type6_hdr) + rcblen;
+	char *req_data = ap_msg->msg + sizeof(struct type6_hdr) + rcblen;
 	char *function_code;
 
 	if (CEIL4(xcRB->request_control_blk_length) <
@@ -412,10 +412,10 @@ static int XCRB_msg_to_type6CPRB_msgX(struct ap_message *ap_msg,
 		return -EINVAL; /* overflow after alignment*/
 
 	/* length checks */
-	ap_msg->length = sizeof(struct type6_hdr) +
+	ap_msg->len = sizeof(struct type6_hdr) +
 		CEIL4(xcRB->request_control_blk_length) +
 		xcRB->request_data_length;
-	if (ap_msg->length > MSGTYPE06_MAX_MSG_SIZE)
+	if (ap_msg->len > MSGTYPE06_MAX_MSG_SIZE)
 		return -EINVAL;
 
 	/*
@@ -480,9 +480,7 @@ static int XCRB_msg_to_type6CPRB_msgX(struct ap_message *ap_msg,
 
 	if (memcmp(function_code, "US", 2) == 0
 	    || memcmp(function_code, "AU", 2) == 0)
-		ap_msg->special = 1;
-	else
-		ap_msg->special = 0;
+		ap_msg->flags |= AP_MSG_FLAG_SPECIAL;
 
 	/* copy data block */
 	if (xcRB->request_data_length &&
@@ -512,7 +510,7 @@ static int xcrb_msg_to_type6_ep11cprb_msgx(struct ap_message *ap_msg,
 		struct ep11_cprb cprbx;
 		unsigned char	pld_tag;	/* fixed value 0x30 */
 		unsigned char	pld_lenfmt;	/* payload length format */
-	} __packed * msg = ap_msg->message;
+	} __packed * msg = ap_msg->msg;
 
 	struct pld_hdr {
 		unsigned char	func_tag;	/* fixed value 0x4 */
@@ -527,7 +525,7 @@ static int xcrb_msg_to_type6_ep11cprb_msgx(struct ap_message *ap_msg,
 		return -EINVAL; /* overflow after alignment*/
 
 	/* length checks */
-	ap_msg->length = sizeof(struct type6_hdr) + xcRB->req_len;
+	ap_msg->len = sizeof(struct type6_hdr) + xcRB->req_len;
 	if (CEIL4(xcRB->req_len) > MSGTYPE06_MAX_MSG_SIZE -
 				   (sizeof(struct type6_hdr)))
 		return -EINVAL;
@@ -569,7 +567,7 @@ static int xcrb_msg_to_type6_ep11cprb_msgx(struct ap_message *ap_msg,
 
 	/* enable special processing based on the cprbs flags special bit */
 	if (msg->cprbx.flags & 0x20)
-		ap_msg->special = 1;
+		ap_msg->flags |= AP_MSG_FLAG_SPECIAL;
 
 	return 0;
 }
@@ -639,7 +637,7 @@ static int convert_type86_ica(struct zcrypt_queue *zq,
 		0x35, 0x9D, 0xD3, 0xD3, 0xA7, 0x9D, 0x5D, 0x41,
 		0x6F, 0x65, 0x1B, 0xCF, 0xA9, 0x87, 0x91, 0x09
 	};
-	struct type86x_reply *msg = reply->message;
+	struct type86x_reply *msg = reply->msg;
 	unsigned short service_rc, service_rs;
 	unsigned int reply_len, pad_len;
 	char *data;
@@ -713,8 +711,8 @@ static int convert_type86_xcrb(struct zcrypt_queue *zq,
 			       struct ap_message *reply,
 			       struct ica_xcRB *xcRB)
 {
-	struct type86_fmt2_msg *msg = reply->message;
-	char *data = reply->message;
+	struct type86_fmt2_msg *msg = reply->msg;
+	char *data = reply->msg;
 
 	/* Copy CPRB to user */
 	if (copy_to_user(xcRB->reply_control_blk_addr,
@@ -744,8 +742,8 @@ static int convert_type86_ep11_xcrb(struct zcrypt_queue *zq,
 				    struct ap_message *reply,
 				    struct ep11_urb *xcRB)
 {
-	struct type86_fmt2_msg *msg = reply->message;
-	char *data = reply->message;
+	struct type86_fmt2_msg *msg = reply->msg;
+	char *data = reply->msg;
 
 	if (xcRB->resp_len < msg->fmt2.count1)
 		return -EINVAL;
@@ -766,8 +764,8 @@ static int convert_type86_rng(struct zcrypt_queue *zq,
 		struct type86_hdr hdr;
 		struct type86_fmt2_ext fmt2;
 		struct CPRBX cprbx;
-	} __packed * msg = reply->message;
-	char *data = reply->message;
+	} __packed * msg = reply->msg;
+	char *data = reply->msg;
 
 	if (msg->cprbx.ccp_rtcode != 0 || msg->cprbx.ccp_rscode != 0)
 		return -EINVAL;
@@ -780,7 +778,7 @@ static int convert_response_ica(struct zcrypt_queue *zq,
 			    char __user *outputdata,
 			    unsigned int outputdatalength)
 {
-	struct type86x_reply *msg = reply->message;
+	struct type86x_reply *msg = reply->msg;
 
 	switch (msg->hdr.type) {
 	case TYPE82_RSP_CODE:
@@ -820,7 +818,7 @@ static int convert_response_xcrb(struct zcrypt_queue *zq,
 			    struct ap_message *reply,
 			    struct ica_xcRB *xcRB)
 {
-	struct type86x_reply *msg = reply->message;
+	struct type86x_reply *msg = reply->msg;
 
 	switch (msg->hdr.type) {
 	case TYPE82_RSP_CODE:
@@ -853,7 +851,7 @@ static int convert_response_xcrb(struct zcrypt_queue *zq,
 static int convert_response_ep11_xcrb(struct zcrypt_queue *zq,
 	struct ap_message *reply, struct ep11_urb *xcRB)
 {
-	struct type86_ep11_reply *msg = reply->message;
+	struct type86_ep11_reply *msg = reply->msg;
 
 	switch (msg->hdr.type) {
 	case TYPE82_RSP_CODE:
@@ -883,7 +881,7 @@ static int convert_response_rng(struct zcrypt_queue *zq,
 				 struct ap_message *reply,
 				 char *data)
 {
-	struct type86x_reply *msg = reply->message;
+	struct type86x_reply *msg = reply->msg;
 
 	switch (msg->hdr.type) {
 	case TYPE82_RSP_CODE:
@@ -928,32 +926,30 @@ static void zcrypt_msgtype6_receive(struct ap_queue *aq,
 	struct response_type *resp_type =
 		(struct response_type *) msg->private;
 	struct type86x_reply *t86r;
-	int length;
+	int len;
 
 	/* Copy the reply message to the request message buffer. */
 	if (!reply)
 		goto out;	/* ap_msg->rc indicates the error */
-	t86r = reply->message;
+	t86r = reply->msg;
 	if (t86r->hdr.type == TYPE86_RSP_CODE &&
 		 t86r->cprbx.cprb_ver_id == 0x02) {
 		switch (resp_type->type) {
 		case CEXXC_RESPONSE_TYPE_ICA:
-			length = sizeof(struct type86x_reply)
-				+ t86r->length - 2;
-			length = min(CEXXC_MAX_ICA_RESPONSE_SIZE, length);
-			memcpy(msg->message, reply->message, length);
+			len = sizeof(struct type86x_reply) + t86r->length - 2;
+			len = min_t(int, CEXXC_MAX_ICA_RESPONSE_SIZE, len);
+			memcpy(msg->msg, reply->msg, len);
 			break;
 		case CEXXC_RESPONSE_TYPE_XCRB:
-			length = t86r->fmt2.offset2 + t86r->fmt2.count2;
-			length = min(MSGTYPE06_MAX_MSG_SIZE, length);
-			memcpy(msg->message, reply->message, length);
+			len = t86r->fmt2.offset2 + t86r->fmt2.count2;
+			len = min_t(int, MSGTYPE06_MAX_MSG_SIZE, len);
+			memcpy(msg->msg, reply->msg, len);
 			break;
 		default:
-			memcpy(msg->message, &error_reply,
-			       sizeof(error_reply));
+			memcpy(msg->msg, &error_reply, sizeof(error_reply));
 		}
 	} else
-		memcpy(msg->message, reply->message, sizeof(error_reply));
+		memcpy(msg->msg, reply->msg, sizeof(error_reply));
 out:
 	complete(&(resp_type->work));
 }
@@ -977,25 +973,25 @@ static void zcrypt_msgtype6_receive_ep11(struct ap_queue *aq,
 	struct response_type *resp_type =
 		(struct response_type *)msg->private;
 	struct type86_ep11_reply *t86r;
-	int length;
+	int len;
 
 	/* Copy the reply message to the request message buffer. */
 	if (!reply)
 		goto out;	/* ap_msg->rc indicates the error */
-	t86r = reply->message;
+	t86r = reply->msg;
 	if (t86r->hdr.type == TYPE86_RSP_CODE &&
 	    t86r->cprbx.cprb_ver_id == 0x04) {
 		switch (resp_type->type) {
 		case CEXXC_RESPONSE_TYPE_EP11:
-			length = t86r->fmt2.offset1 + t86r->fmt2.count1;
-			length = min(MSGTYPE06_MAX_MSG_SIZE, length);
-			memcpy(msg->message, reply->message, length);
+			len = t86r->fmt2.offset1 + t86r->fmt2.count1;
+			len = min_t(int, MSGTYPE06_MAX_MSG_SIZE, len);
+			memcpy(msg->msg, reply->msg, len);
 			break;
 		default:
-			memcpy(msg->message, &error_reply, sizeof(error_reply));
+			memcpy(msg->msg, &error_reply, sizeof(error_reply));
 		}
 	} else {
-		memcpy(msg->message, reply->message, sizeof(error_reply));
+		memcpy(msg->msg, reply->msg, sizeof(error_reply));
 	}
 out:
 	complete(&(resp_type->work));
@@ -1020,8 +1016,8 @@ static long zcrypt_msgtype6_modexpo(struct zcrypt_queue *zq,
 	int rc;
 
 	ap_init_message(&ap_msg);
-	ap_msg.message = (void *) get_zeroed_page(GFP_KERNEL);
-	if (!ap_msg.message)
+	ap_msg.msg = (void *) get_zeroed_page(GFP_KERNEL);
+	if (!ap_msg.msg)
 		return -ENOMEM;
 	ap_msg.receive = zcrypt_msgtype6_receive;
 	ap_msg.psmid = (((unsigned long long) current->pid) << 32) +
@@ -1043,7 +1039,7 @@ static long zcrypt_msgtype6_modexpo(struct zcrypt_queue *zq,
 		/* Signal pending. */
 		ap_cancel_message(zq->queue, &ap_msg);
 out_free:
-	free_page((unsigned long) ap_msg.message);
+	free_page((unsigned long) ap_msg.msg);
 	return rc;
 }
 
@@ -1064,8 +1060,8 @@ static long zcrypt_msgtype6_modexpo_crt(struct zcrypt_queue *zq,
 	int rc;
 
 	ap_init_message(&ap_msg);
-	ap_msg.message = (void *) get_zeroed_page(GFP_KERNEL);
-	if (!ap_msg.message)
+	ap_msg.msg = (void *) get_zeroed_page(GFP_KERNEL);
+	if (!ap_msg.msg)
 		return -ENOMEM;
 	ap_msg.receive = zcrypt_msgtype6_receive;
 	ap_msg.psmid = (((unsigned long long) current->pid) << 32) +
@@ -1088,7 +1084,7 @@ static long zcrypt_msgtype6_modexpo_crt(struct zcrypt_queue *zq,
 		ap_cancel_message(zq->queue, &ap_msg);
 	}
 out_free:
-	free_page((unsigned long) ap_msg.message);
+	free_page((unsigned long) ap_msg.msg);
 	return rc;
 }
 
@@ -1107,8 +1103,8 @@ unsigned int get_cprb_fc(struct ica_xcRB *xcRB,
 		.type = CEXXC_RESPONSE_TYPE_XCRB,
 	};
 
-	ap_msg->message = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
-	if (!ap_msg->message)
+	ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
+	if (!ap_msg->msg)
 		return -ENOMEM;
 	ap_msg->receive = zcrypt_msgtype6_receive;
 	ap_msg->psmid = (((unsigned long long) current->pid) << 32) +
@@ -1162,8 +1158,8 @@ unsigned int get_ep11cprb_fc(struct ep11_urb *xcrb,
 		.type = CEXXC_RESPONSE_TYPE_EP11,
 	};
 
-	ap_msg->message = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
-	if (!ap_msg->message)
+	ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
+	if (!ap_msg->msg)
 		return -ENOMEM;
 	ap_msg->receive = zcrypt_msgtype6_receive_ep11;
 	ap_msg->psmid = (((unsigned long long) current->pid) << 32) +
@@ -1193,7 +1189,7 @@ static long zcrypt_msgtype6_send_ep11_cprb(struct zcrypt_queue *zq,
 		struct ep11_cprb cprbx;
 		unsigned char	pld_tag;	/* fixed value 0x30 */
 		unsigned char	pld_lenfmt;	/* payload length format */
-	} __packed * msg = ap_msg->message;
+	} __packed * msg = ap_msg->msg;
 	struct pld_hdr {
 		unsigned char	func_tag;	/* fixed value 0x4 */
 		unsigned char	func_len;	/* fixed value 0x4 */
@@ -1256,8 +1252,8 @@ unsigned int get_rng_fc(struct ap_message *ap_msg, int *func_code,
 		.type = CEXXC_RESPONSE_TYPE_XCRB,
 	};
 
-	ap_msg->message = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
-	if (!ap_msg->message)
+	ap_msg->msg = kmalloc(MSGTYPE06_MAX_MSG_SIZE, GFP_KERNEL);
+	if (!ap_msg->msg)
 		return -ENOMEM;
 	ap_msg->receive = zcrypt_msgtype6_receive;
 	ap_msg->psmid = (((unsigned long long) current->pid) << 32) +
@@ -1290,7 +1286,7 @@ static long zcrypt_msgtype6_rng(struct zcrypt_queue *zq,
 		char rule[8];
 		short int verb_length;
 		short int key_length;
-	} __packed * msg = ap_msg->message;
+	} __packed * msg = ap_msg->msg;
 	struct response_type *rtype = (struct response_type *)(ap_msg->private);
 	int rc;
 
diff --git a/drivers/s390/crypto/zcrypt_msgtype6.h b/drivers/s390/crypto/zcrypt_msgtype6.h
index 41a0df5f070f..0de280a81dd4 100644
--- a/drivers/s390/crypto/zcrypt_msgtype6.h
+++ b/drivers/s390/crypto/zcrypt_msgtype6.h
@@ -127,7 +127,7 @@ static inline void rng_type6CPRB_msgX(struct ap_message *ap_msg,
 		char rule[8];
 		short int verb_length;
 		short int key_length;
-	} __packed * msg = ap_msg->message;
+	} __packed * msg = ap_msg->msg;
 	static struct type6_hdr static_type6_hdrX = {
 		.type		= 0x06,
 		.offset1	= 0x00000058,
@@ -154,7 +154,7 @@ static inline void rng_type6CPRB_msgX(struct ap_message *ap_msg,
 	memcpy(msg->rule, "RANDOM  ", 8);
 	msg->verb_length = 0x02;
 	msg->key_length = 0x02;
-	ap_msg->length = sizeof(*msg);
+	ap_msg->len = sizeof(*msg);
 	*domain = (unsigned short)msg->cprbx.domain;
 }
 
diff --git a/drivers/s390/crypto/zcrypt_queue.c b/drivers/s390/crypto/zcrypt_queue.c
index b7d9fa567880..8bae6ad159a7 100644
--- a/drivers/s390/crypto/zcrypt_queue.c
+++ b/drivers/s390/crypto/zcrypt_queue.c
@@ -107,10 +107,10 @@ struct zcrypt_queue *zcrypt_queue_alloc(size_t max_response_size)
 	zq = kzalloc(sizeof(struct zcrypt_queue), GFP_KERNEL);
 	if (!zq)
 		return NULL;
-	zq->reply.message = kmalloc(max_response_size, GFP_KERNEL);
-	if (!zq->reply.message)
+	zq->reply.msg = kmalloc(max_response_size, GFP_KERNEL);
+	if (!zq->reply.msg)
 		goto out_free;
-	zq->reply.length = max_response_size;
+	zq->reply.len = max_response_size;
 	INIT_LIST_HEAD(&zq->list);
 	kref_init(&zq->refcount);
 	return zq;
@@ -123,7 +123,7 @@ EXPORT_SYMBOL(zcrypt_queue_alloc);
 
 void zcrypt_queue_free(struct zcrypt_queue *zq)
 {
-	kfree(zq->reply.message);
+	kfree(zq->reply.msg);
 	kfree(zq);
 }
 EXPORT_SYMBOL(zcrypt_queue_free);

From 7e202acb5c4397b17e275c017f84e4df34314578 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.ibm.com>
Date: Wed, 20 May 2020 16:07:19 +0200
Subject: [PATCH 256/502] s390/zcrypt: split ioctl function into smaller code
 units

The zcrypt_unlocked_ioctl() function has become large. So split the
4 ioctls ICARSAMODEXPO, ICARSACRT, ZSECSENDCPRB and ZSENDEP11CPRB out
into new static functions. This makes the code more readable and is a
preparation step for further improvements needed on these ioctls.

Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 drivers/s390/crypto/zcrypt_api.c | 182 +++++++++++++++++--------------
 1 file changed, 101 insertions(+), 81 deletions(-)

diff --git a/drivers/s390/crypto/zcrypt_api.c b/drivers/s390/crypto/zcrypt_api.c
index 7775ff84f223..4dbbfd88262c 100644
--- a/drivers/s390/crypto/zcrypt_api.c
+++ b/drivers/s390/crypto/zcrypt_api.c
@@ -1298,6 +1298,99 @@ static int zcrypt_requestq_count(void)
 	return requestq_count;
 }
 
+static int icarsamodexpo_ioctl(struct ap_perms *perms, unsigned long arg)
+{
+	int rc;
+	struct ica_rsa_modexpo mex;
+	struct ica_rsa_modexpo __user *umex = (void __user *) arg;
+
+	if (copy_from_user(&mex, umex, sizeof(mex)))
+		return -EFAULT;
+	do {
+		rc = zcrypt_rsa_modexpo(perms, &mex);
+	} while (rc == -EAGAIN);
+	/* on failure: retry once again after a requested rescan */
+	if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+		do {
+			rc = zcrypt_rsa_modexpo(perms, &mex);
+		} while (rc == -EAGAIN);
+	if (rc) {
+		ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSAMODEXPO rc=%d\n", rc);
+		return rc;
+	}
+	return put_user(mex.outputdatalength, &umex->outputdatalength);
+}
+
+static int icarsacrt_ioctl(struct ap_perms *perms, unsigned long arg)
+{
+	int rc;
+	struct ica_rsa_modexpo_crt crt;
+	struct ica_rsa_modexpo_crt __user *ucrt = (void __user *) arg;
+
+	if (copy_from_user(&crt, ucrt, sizeof(crt)))
+		return -EFAULT;
+	do {
+		rc = zcrypt_rsa_crt(perms, &crt);
+	} while (rc == -EAGAIN);
+	/* on failure: retry once again after a requested rescan */
+	if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+		do {
+			rc = zcrypt_rsa_crt(perms, &crt);
+		} while (rc == -EAGAIN);
+	if (rc) {
+		ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSACRT rc=%d\n", rc);
+		return rc;
+	}
+	return put_user(crt.outputdatalength, &ucrt->outputdatalength);
+}
+
+static int zsecsendcprb_ioctl(struct ap_perms *perms, unsigned long arg)
+{
+	int rc;
+	struct ica_xcRB xcRB;
+	struct ica_xcRB __user *uxcRB = (void __user *) arg;
+
+	if (copy_from_user(&xcRB, uxcRB, sizeof(xcRB)))
+		return -EFAULT;
+	do {
+		rc = _zcrypt_send_cprb(perms, &xcRB);
+	} while (rc == -EAGAIN);
+	/* on failure: retry once again after a requested rescan */
+	if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+		do {
+			rc = _zcrypt_send_cprb(perms, &xcRB);
+		} while (rc == -EAGAIN);
+	if (rc)
+		ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDCPRB rc=%d status=0x%x\n",
+			   rc, xcRB.status);
+	if (copy_to_user(uxcRB, &xcRB, sizeof(xcRB)))
+		return -EFAULT;
+	return rc;
+}
+
+static int zsendep11cprb_ioctl(struct ap_perms *perms, unsigned long arg)
+{
+	int rc;
+	struct ep11_urb xcrb;
+	struct ep11_urb __user *uxcrb = (void __user *)arg;
+
+	if (copy_from_user(&xcrb, uxcrb, sizeof(xcrb)))
+		return -EFAULT;
+	do {
+		rc = _zcrypt_send_ep11_cprb(perms, &xcrb);
+	} while (rc == -EAGAIN);
+	/* on failure: retry once again after a requested rescan */
+	if ((rc == -ENODEV) && (zcrypt_process_rescan()))
+		do {
+			rc = _zcrypt_send_ep11_cprb(perms, &xcrb);
+		} while (rc == -EAGAIN);
+	if (rc)
+		ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDEP11CPRB rc=%d\n", rc);
+	if (copy_to_user(uxcrb, &xcrb, sizeof(xcrb)))
+		return -EFAULT;
+	return rc;
+}
+
 static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd,
 				  unsigned long arg)
 {
@@ -1310,87 +1403,14 @@ static long zcrypt_unlocked_ioctl(struct file *filp, unsigned int cmd,
 		return rc;
 
 	switch (cmd) {
-	case ICARSAMODEXPO: {
-		struct ica_rsa_modexpo __user *umex = (void __user *) arg;
-		struct ica_rsa_modexpo mex;
-
-		if (copy_from_user(&mex, umex, sizeof(mex)))
-			return -EFAULT;
-		do {
-			rc = zcrypt_rsa_modexpo(perms, &mex);
-		} while (rc == -EAGAIN);
-		/* on failure: retry once again after a requested rescan */
-		if ((rc == -ENODEV) && (zcrypt_process_rescan()))
-			do {
-				rc = zcrypt_rsa_modexpo(perms, &mex);
-			} while (rc == -EAGAIN);
-		if (rc) {
-			ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSAMODEXPO rc=%d\n", rc);
-			return rc;
-		}
-		return put_user(mex.outputdatalength, &umex->outputdatalength);
-	}
-	case ICARSACRT: {
-		struct ica_rsa_modexpo_crt __user *ucrt = (void __user *) arg;
-		struct ica_rsa_modexpo_crt crt;
-
-		if (copy_from_user(&crt, ucrt, sizeof(crt)))
-			return -EFAULT;
-		do {
-			rc = zcrypt_rsa_crt(perms, &crt);
-		} while (rc == -EAGAIN);
-		/* on failure: retry once again after a requested rescan */
-		if ((rc == -ENODEV) && (zcrypt_process_rescan()))
-			do {
-				rc = zcrypt_rsa_crt(perms, &crt);
-			} while (rc == -EAGAIN);
-		if (rc) {
-			ZCRYPT_DBF(DBF_DEBUG, "ioctl ICARSACRT rc=%d\n", rc);
-			return rc;
-		}
-		return put_user(crt.outputdatalength, &ucrt->outputdatalength);
-	}
-	case ZSECSENDCPRB: {
-		struct ica_xcRB __user *uxcRB = (void __user *) arg;
-		struct ica_xcRB xcRB;
-
-		if (copy_from_user(&xcRB, uxcRB, sizeof(xcRB)))
-			return -EFAULT;
-		do {
-			rc = _zcrypt_send_cprb(perms, &xcRB);
-		} while (rc == -EAGAIN);
-		/* on failure: retry once again after a requested rescan */
-		if ((rc == -ENODEV) && (zcrypt_process_rescan()))
-			do {
-				rc = _zcrypt_send_cprb(perms, &xcRB);
-			} while (rc == -EAGAIN);
-		if (rc)
-			ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDCPRB rc=%d status=0x%x\n",
-				   rc, xcRB.status);
-		if (copy_to_user(uxcRB, &xcRB, sizeof(xcRB)))
-			return -EFAULT;
-		return rc;
-	}
-	case ZSENDEP11CPRB: {
-		struct ep11_urb __user *uxcrb = (void __user *)arg;
-		struct ep11_urb xcrb;
-
-		if (copy_from_user(&xcrb, uxcrb, sizeof(xcrb)))
-			return -EFAULT;
-		do {
-			rc = _zcrypt_send_ep11_cprb(perms, &xcrb);
-		} while (rc == -EAGAIN);
-		/* on failure: retry once again after a requested rescan */
-		if ((rc == -ENODEV) && (zcrypt_process_rescan()))
-			do {
-				rc = _zcrypt_send_ep11_cprb(perms, &xcrb);
-			} while (rc == -EAGAIN);
-		if (rc)
-			ZCRYPT_DBF(DBF_DEBUG, "ioctl ZSENDEP11CPRB rc=%d\n", rc);
-		if (copy_to_user(uxcrb, &xcrb, sizeof(xcrb)))
-			return -EFAULT;
-		return rc;
-	}
+	case ICARSAMODEXPO:
+		return icarsamodexpo_ioctl(perms, arg);
+	case ICARSACRT:
+		return icarsacrt_ioctl(perms, arg);
+	case ZSECSENDCPRB:
+		return zsecsendcprb_ioctl(perms, arg);
+	case ZSENDEP11CPRB:
+		return zsendep11cprb_ioctl(perms, arg);
 	case ZCRYPT_DEVICE_STATUS: {
 		struct zcrypt_device_status_ext *device_status;
 		size_t total_size = MAX_ZDEV_ENTRIES_EXT

From dc4b6ded3c17ebe1d7532943192b2308c031c43b Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.ibm.com>
Date: Tue, 26 May 2020 10:49:33 +0200
Subject: [PATCH 257/502] s390/ap: rename and clarify ap state machine related
 stuff

There is a state machine held for each ap queue device.
The states and functions related to this were sometimes
named with _sm_ and sometimes without. This patch clarifies
and renames all the ap queue state machine related functions,
enums and defines to have a _sm_ in the name.

There is no functional change coming with this patch - it's
only beautifying code.

Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 drivers/s390/crypto/ap_bus.c   |  18 +--
 drivers/s390/crypto/ap_bus.h   |  58 +++++-----
 drivers/s390/crypto/ap_queue.c | 200 ++++++++++++++++-----------------
 3 files changed, 138 insertions(+), 138 deletions(-)

diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index e71ca4a719a5..64fa66788194 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -342,13 +342,13 @@ static int ap_query_queue(ap_qid_t qid, int *queue_depth, int *device_type,
 	}
 }
 
-void ap_wait(enum ap_wait wait)
+void ap_wait(enum ap_sm_wait wait)
 {
 	ktime_t hr_time;
 
 	switch (wait) {
-	case AP_WAIT_AGAIN:
-	case AP_WAIT_INTERRUPT:
+	case AP_SM_WAIT_AGAIN:
+	case AP_SM_WAIT_INTERRUPT:
 		if (ap_using_interrupts())
 			break;
 		if (ap_poll_kthread) {
@@ -356,7 +356,7 @@ void ap_wait(enum ap_wait wait)
 			break;
 		}
 		fallthrough;
-	case AP_WAIT_TIMEOUT:
+	case AP_SM_WAIT_TIMEOUT:
 		spin_lock_bh(&ap_poll_timer_lock);
 		if (!hrtimer_is_queued(&ap_poll_timer)) {
 			hr_time = poll_timeout;
@@ -365,7 +365,7 @@ void ap_wait(enum ap_wait wait)
 		}
 		spin_unlock_bh(&ap_poll_timer_lock);
 		break;
-	case AP_WAIT_NONE:
+	case AP_SM_WAIT_NONE:
 	default:
 		break;
 	}
@@ -382,7 +382,7 @@ void ap_request_timeout(struct timer_list *t)
 	struct ap_queue *aq = from_timer(aq, t, timeout);
 
 	spin_lock_bh(&aq->lock);
-	ap_wait(ap_sm_event(aq, AP_EVENT_TIMEOUT));
+	ap_wait(ap_sm_event(aq, AP_SM_EVENT_TIMEOUT));
 	spin_unlock_bh(&aq->lock);
 }
 
@@ -418,7 +418,7 @@ static void ap_tasklet_fn(unsigned long dummy)
 {
 	int bkt;
 	struct ap_queue *aq;
-	enum ap_wait wait = AP_WAIT_NONE;
+	enum ap_sm_wait wait = AP_SM_WAIT_NONE;
 
 	/* Reset the indicator if interrupts are used. Thus new interrupts can
 	 * be received. Doing it in the beginning of the tasklet is therefor
@@ -430,7 +430,7 @@ static void ap_tasklet_fn(unsigned long dummy)
 	spin_lock_bh(&ap_queues_lock);
 	hash_for_each(ap_queues, bkt, aq, hnode) {
 		spin_lock_bh(&aq->lock);
-		wait = min(wait, ap_sm_event_loop(aq, AP_EVENT_POLL));
+		wait = min(wait, ap_sm_event_loop(aq, AP_SM_EVENT_POLL));
 		spin_unlock_bh(&aq->lock);
 	}
 	spin_unlock_bh(&ap_queues_lock);
@@ -1370,7 +1370,7 @@ static void _ap_scan_bus_adapter(int id)
 				borked = 1;
 			else {
 				spin_lock_bh(&aq->lock);
-				borked = aq->state == AP_STATE_BORKED;
+				borked = aq->sm_state == AP_SM_STATE_BORKED;
 				spin_unlock_bh(&aq->lock);
 			}
 			if (borked) {
diff --git a/drivers/s390/crypto/ap_bus.h b/drivers/s390/crypto/ap_bus.h
index 69432e93643a..1a1d5e3c8d45 100644
--- a/drivers/s390/crypto/ap_bus.h
+++ b/drivers/s390/crypto/ap_bus.h
@@ -83,39 +83,39 @@ static inline int ap_test_bit(unsigned int *ptr, unsigned int nr)
 #define AP_INTR_ENABLED		1	/* AP interrupt enabled */
 
 /*
- * AP device states
+ * AP queue state machine states
  */
-enum ap_state {
-	AP_STATE_RESET_START,
-	AP_STATE_RESET_WAIT,
-	AP_STATE_SETIRQ_WAIT,
-	AP_STATE_IDLE,
-	AP_STATE_WORKING,
-	AP_STATE_QUEUE_FULL,
-	AP_STATE_REMOVE,	/* about to be removed from driver */
-	AP_STATE_UNBOUND,	/* momentary not bound to a driver */
-	AP_STATE_BORKED,	/* broken */
-	NR_AP_STATES
+enum ap_sm_state {
+	AP_SM_STATE_RESET_START,
+	AP_SM_STATE_RESET_WAIT,
+	AP_SM_STATE_SETIRQ_WAIT,
+	AP_SM_STATE_IDLE,
+	AP_SM_STATE_WORKING,
+	AP_SM_STATE_QUEUE_FULL,
+	AP_SM_STATE_REMOVE,	/* about to be removed from driver */
+	AP_SM_STATE_UNBOUND,	/* momentary not bound to a driver */
+	AP_SM_STATE_BORKED,	/* broken */
+	NR_AP_SM_STATES
 };
 
 /*
- * AP device events
+ * AP queue state machine events
  */
-enum ap_event {
-	AP_EVENT_POLL,
-	AP_EVENT_TIMEOUT,
-	NR_AP_EVENTS
+enum ap_sm_event {
+	AP_SM_EVENT_POLL,
+	AP_SM_EVENT_TIMEOUT,
+	NR_AP_SM_EVENTS
 };
 
 /*
- * AP wait behaviour
+ * AP queue state wait behaviour
  */
-enum ap_wait {
-	AP_WAIT_AGAIN,		/* retry immediately */
-	AP_WAIT_TIMEOUT,	/* wait for timeout */
-	AP_WAIT_INTERRUPT,	/* wait for thin interrupt (if available) */
-	AP_WAIT_NONE,		/* no wait */
-	NR_AP_WAIT
+enum ap_sm_wait {
+	AP_SM_WAIT_AGAIN,	/* retry immediately */
+	AP_SM_WAIT_TIMEOUT,	/* wait for timeout */
+	AP_SM_WAIT_INTERRUPT,	/* wait for thin interrupt (if available) */
+	AP_SM_WAIT_NONE,	/* no wait */
+	NR_AP_SM_WAIT
 };
 
 struct ap_device;
@@ -172,7 +172,7 @@ struct ap_queue {
 	ap_qid_t qid;			/* AP queue id. */
 	int interrupt;			/* indicate if interrupts are enabled */
 	int queue_count;		/* # messages currently on AP queue. */
-	enum ap_state state;		/* State of the AP device. */
+	enum ap_sm_state sm_state;	/* ap queue state machine state */
 	int pendingq_count;		/* # requests on pendingq list. */
 	int requestq_count;		/* # requests on requestq list. */
 	u64 total_request_count;	/* # requests ever for this AP device.*/
@@ -185,7 +185,7 @@ struct ap_queue {
 
 #define to_ap_queue(x) container_of((x), struct ap_queue, ap_dev.device)
 
-typedef enum ap_wait (ap_func_t)(struct ap_queue *queue);
+typedef enum ap_sm_wait (ap_func_t)(struct ap_queue *queue);
 
 struct ap_message {
 	struct list_head list;		/* Request queueing. */
@@ -231,15 +231,15 @@ static inline void ap_release_message(struct ap_message *ap_msg)
 int ap_send(ap_qid_t, unsigned long long, void *, size_t);
 int ap_recv(ap_qid_t, unsigned long long *, void *, size_t);
 
-enum ap_wait ap_sm_event(struct ap_queue *aq, enum ap_event event);
-enum ap_wait ap_sm_event_loop(struct ap_queue *aq, enum ap_event event);
+enum ap_sm_wait ap_sm_event(struct ap_queue *aq, enum ap_sm_event event);
+enum ap_sm_wait ap_sm_event_loop(struct ap_queue *aq, enum ap_sm_event event);
 
 void ap_queue_message(struct ap_queue *aq, struct ap_message *ap_msg);
 void ap_cancel_message(struct ap_queue *aq, struct ap_message *ap_msg);
 void ap_flush_queue(struct ap_queue *aq);
 
 void *ap_airq_ptr(void);
-void ap_wait(enum ap_wait wait);
+void ap_wait(enum ap_sm_wait wait);
 void ap_request_timeout(struct timer_list *t);
 void ap_bus_force_rescan(void);
 
diff --git a/drivers/s390/crypto/ap_queue.c b/drivers/s390/crypto/ap_queue.c
index d6cc384f294b..688ebebbf98c 100644
--- a/drivers/s390/crypto/ap_queue.c
+++ b/drivers/s390/crypto/ap_queue.c
@@ -119,9 +119,9 @@ EXPORT_SYMBOL(ap_recv);
 
 /* State machine definitions and helpers */
 
-static enum ap_wait ap_sm_nop(struct ap_queue *aq)
+static enum ap_sm_wait ap_sm_nop(struct ap_queue *aq)
 {
-	return AP_WAIT_NONE;
+	return AP_SM_WAIT_NONE;
 }
 
 /**
@@ -129,7 +129,7 @@ static enum ap_wait ap_sm_nop(struct ap_queue *aq)
  *	not change the state of the device.
  * @aq: pointer to the AP queue
  *
- * Returns AP_WAIT_NONE, AP_WAIT_AGAIN, or AP_WAIT_INTERRUPT
+ * Returns AP_SM_WAIT_NONE, AP_SM_WAIT_AGAIN, or AP_SM_WAIT_INTERRUPT
  */
 static struct ap_queue_status ap_sm_recv(struct ap_queue *aq)
 {
@@ -172,31 +172,31 @@ static struct ap_queue_status ap_sm_recv(struct ap_queue *aq)
  * ap_sm_read(): Receive pending reply messages from an AP queue.
  * @aq: pointer to the AP queue
  *
- * Returns AP_WAIT_NONE, AP_WAIT_AGAIN, or AP_WAIT_INTERRUPT
+ * Returns AP_SM_WAIT_NONE, AP_SM_WAIT_AGAIN, or AP_SM_WAIT_INTERRUPT
  */
-static enum ap_wait ap_sm_read(struct ap_queue *aq)
+static enum ap_sm_wait ap_sm_read(struct ap_queue *aq)
 {
 	struct ap_queue_status status;
 
 	if (!aq->reply)
-		return AP_WAIT_NONE;
+		return AP_SM_WAIT_NONE;
 	status = ap_sm_recv(aq);
 	switch (status.response_code) {
 	case AP_RESPONSE_NORMAL:
 		if (aq->queue_count > 0) {
-			aq->state = AP_STATE_WORKING;
-			return AP_WAIT_AGAIN;
+			aq->sm_state = AP_SM_STATE_WORKING;
+			return AP_SM_WAIT_AGAIN;
 		}
-		aq->state = AP_STATE_IDLE;
-		return AP_WAIT_NONE;
+		aq->sm_state = AP_SM_STATE_IDLE;
+		return AP_SM_WAIT_NONE;
 	case AP_RESPONSE_NO_PENDING_REPLY:
 		if (aq->queue_count > 0)
-			return AP_WAIT_INTERRUPT;
-		aq->state = AP_STATE_IDLE;
-		return AP_WAIT_NONE;
+			return AP_SM_WAIT_INTERRUPT;
+		aq->sm_state = AP_SM_STATE_IDLE;
+		return AP_SM_WAIT_NONE;
 	default:
-		aq->state = AP_STATE_BORKED;
-		return AP_WAIT_NONE;
+		aq->sm_state = AP_SM_STATE_BORKED;
+		return AP_SM_WAIT_NONE;
 	}
 }
 
@@ -204,15 +204,15 @@ static enum ap_wait ap_sm_read(struct ap_queue *aq)
  * ap_sm_write(): Send messages from the request queue to an AP queue.
  * @aq: pointer to the AP queue
  *
- * Returns AP_WAIT_NONE, AP_WAIT_AGAIN, or AP_WAIT_INTERRUPT
+ * Returns AP_SM_WAIT_NONE, AP_SM_WAIT_AGAIN, or AP_SM_WAIT_INTERRUPT
  */
-static enum ap_wait ap_sm_write(struct ap_queue *aq)
+static enum ap_sm_wait ap_sm_write(struct ap_queue *aq)
 {
 	struct ap_queue_status status;
 	struct ap_message *ap_msg;
 
 	if (aq->requestq_count <= 0)
-		return AP_WAIT_NONE;
+		return AP_SM_WAIT_NONE;
 	/* Start the next request on the queue. */
 	ap_msg = list_entry(aq->requestq.next, struct ap_message, list);
 	status = __ap_send(aq->qid, ap_msg->psmid,
@@ -227,26 +227,26 @@ static enum ap_wait ap_sm_write(struct ap_queue *aq)
 		aq->requestq_count--;
 		aq->pendingq_count++;
 		if (aq->queue_count < aq->card->queue_depth) {
-			aq->state = AP_STATE_WORKING;
-			return AP_WAIT_AGAIN;
+			aq->sm_state = AP_SM_STATE_WORKING;
+			return AP_SM_WAIT_AGAIN;
 		}
 		fallthrough;
 	case AP_RESPONSE_Q_FULL:
-		aq->state = AP_STATE_QUEUE_FULL;
-		return AP_WAIT_INTERRUPT;
+		aq->sm_state = AP_SM_STATE_QUEUE_FULL;
+		return AP_SM_WAIT_INTERRUPT;
 	case AP_RESPONSE_RESET_IN_PROGRESS:
-		aq->state = AP_STATE_RESET_WAIT;
-		return AP_WAIT_TIMEOUT;
+		aq->sm_state = AP_SM_STATE_RESET_WAIT;
+		return AP_SM_WAIT_TIMEOUT;
 	case AP_RESPONSE_MESSAGE_TOO_BIG:
 	case AP_RESPONSE_REQ_FAC_NOT_INST:
 		list_del_init(&ap_msg->list);
 		aq->requestq_count--;
 		ap_msg->rc = -EINVAL;
 		ap_msg->receive(aq, ap_msg, NULL);
-		return AP_WAIT_AGAIN;
+		return AP_SM_WAIT_AGAIN;
 	default:
-		aq->state = AP_STATE_BORKED;
-		return AP_WAIT_NONE;
+		aq->sm_state = AP_SM_STATE_BORKED;
+		return AP_SM_WAIT_NONE;
 	}
 }
 
@@ -254,9 +254,9 @@ static enum ap_wait ap_sm_write(struct ap_queue *aq)
  * ap_sm_read_write(): Send and receive messages to/from an AP queue.
  * @aq: pointer to the AP queue
  *
- * Returns AP_WAIT_NONE, AP_WAIT_AGAIN, or AP_WAIT_INTERRUPT
+ * Returns AP_SM_WAIT_NONE, AP_SM_WAIT_AGAIN, or AP_SM_WAIT_INTERRUPT
  */
-static enum ap_wait ap_sm_read_write(struct ap_queue *aq)
+static enum ap_sm_wait ap_sm_read_write(struct ap_queue *aq)
 {
 	return min(ap_sm_read(aq), ap_sm_write(aq));
 }
@@ -267,7 +267,7 @@ static enum ap_wait ap_sm_read_write(struct ap_queue *aq)
  *
  * Submit the Reset command to an AP queue.
  */
-static enum ap_wait ap_sm_reset(struct ap_queue *aq)
+static enum ap_sm_wait ap_sm_reset(struct ap_queue *aq)
 {
 	struct ap_queue_status status;
 
@@ -275,17 +275,17 @@ static enum ap_wait ap_sm_reset(struct ap_queue *aq)
 	switch (status.response_code) {
 	case AP_RESPONSE_NORMAL:
 	case AP_RESPONSE_RESET_IN_PROGRESS:
-		aq->state = AP_STATE_RESET_WAIT;
+		aq->sm_state = AP_SM_STATE_RESET_WAIT;
 		aq->interrupt = AP_INTR_DISABLED;
-		return AP_WAIT_TIMEOUT;
+		return AP_SM_WAIT_TIMEOUT;
 	case AP_RESPONSE_BUSY:
-		return AP_WAIT_TIMEOUT;
+		return AP_SM_WAIT_TIMEOUT;
 	case AP_RESPONSE_Q_NOT_AVAIL:
 	case AP_RESPONSE_DECONFIGURED:
 	case AP_RESPONSE_CHECKSTOPPED:
 	default:
-		aq->state = AP_STATE_BORKED;
-		return AP_WAIT_NONE;
+		aq->sm_state = AP_SM_STATE_BORKED;
+		return AP_SM_WAIT_NONE;
 	}
 }
 
@@ -295,7 +295,7 @@ static enum ap_wait ap_sm_reset(struct ap_queue *aq)
  *
  * Returns AP_POLL_IMMEDIATELY, AP_POLL_AFTER_TIMEROUT or 0.
  */
-static enum ap_wait ap_sm_reset_wait(struct ap_queue *aq)
+static enum ap_sm_wait ap_sm_reset_wait(struct ap_queue *aq)
 {
 	struct ap_queue_status status;
 	void *lsi_ptr;
@@ -311,20 +311,20 @@ static enum ap_wait ap_sm_reset_wait(struct ap_queue *aq)
 	case AP_RESPONSE_NORMAL:
 		lsi_ptr = ap_airq_ptr();
 		if (lsi_ptr && ap_queue_enable_interruption(aq, lsi_ptr) == 0)
-			aq->state = AP_STATE_SETIRQ_WAIT;
+			aq->sm_state = AP_SM_STATE_SETIRQ_WAIT;
 		else
-			aq->state = (aq->queue_count > 0) ?
-				AP_STATE_WORKING : AP_STATE_IDLE;
-		return AP_WAIT_AGAIN;
+			aq->sm_state = (aq->queue_count > 0) ?
+				AP_SM_STATE_WORKING : AP_SM_STATE_IDLE;
+		return AP_SM_WAIT_AGAIN;
 	case AP_RESPONSE_BUSY:
 	case AP_RESPONSE_RESET_IN_PROGRESS:
-		return AP_WAIT_TIMEOUT;
+		return AP_SM_WAIT_TIMEOUT;
 	case AP_RESPONSE_Q_NOT_AVAIL:
 	case AP_RESPONSE_DECONFIGURED:
 	case AP_RESPONSE_CHECKSTOPPED:
 	default:
-		aq->state = AP_STATE_BORKED;
-		return AP_WAIT_NONE;
+		aq->sm_state = AP_SM_STATE_BORKED;
+		return AP_SM_WAIT_NONE;
 	}
 }
 
@@ -334,7 +334,7 @@ static enum ap_wait ap_sm_reset_wait(struct ap_queue *aq)
  *
  * Returns AP_POLL_IMMEDIATELY, AP_POLL_AFTER_TIMEROUT or 0.
  */
-static enum ap_wait ap_sm_setirq_wait(struct ap_queue *aq)
+static enum ap_sm_wait ap_sm_setirq_wait(struct ap_queue *aq)
 {
 	struct ap_queue_status status;
 
@@ -348,75 +348,75 @@ static enum ap_wait ap_sm_setirq_wait(struct ap_queue *aq)
 	if (status.irq_enabled == 1) {
 		/* Irqs are now enabled */
 		aq->interrupt = AP_INTR_ENABLED;
-		aq->state = (aq->queue_count > 0) ?
-			AP_STATE_WORKING : AP_STATE_IDLE;
+		aq->sm_state = (aq->queue_count > 0) ?
+			AP_SM_STATE_WORKING : AP_SM_STATE_IDLE;
 	}
 
 	switch (status.response_code) {
 	case AP_RESPONSE_NORMAL:
 		if (aq->queue_count > 0)
-			return AP_WAIT_AGAIN;
+			return AP_SM_WAIT_AGAIN;
 		fallthrough;
 	case AP_RESPONSE_NO_PENDING_REPLY:
-		return AP_WAIT_TIMEOUT;
+		return AP_SM_WAIT_TIMEOUT;
 	default:
-		aq->state = AP_STATE_BORKED;
-		return AP_WAIT_NONE;
+		aq->sm_state = AP_SM_STATE_BORKED;
+		return AP_SM_WAIT_NONE;
 	}
 }
 
 /*
  * AP state machine jump table
  */
-static ap_func_t *ap_jumptable[NR_AP_STATES][NR_AP_EVENTS] = {
-	[AP_STATE_RESET_START] = {
-		[AP_EVENT_POLL] = ap_sm_reset,
-		[AP_EVENT_TIMEOUT] = ap_sm_nop,
+static ap_func_t *ap_jumptable[NR_AP_SM_STATES][NR_AP_SM_EVENTS] = {
+	[AP_SM_STATE_RESET_START] = {
+		[AP_SM_EVENT_POLL] = ap_sm_reset,
+		[AP_SM_EVENT_TIMEOUT] = ap_sm_nop,
 	},
-	[AP_STATE_RESET_WAIT] = {
-		[AP_EVENT_POLL] = ap_sm_reset_wait,
-		[AP_EVENT_TIMEOUT] = ap_sm_nop,
+	[AP_SM_STATE_RESET_WAIT] = {
+		[AP_SM_EVENT_POLL] = ap_sm_reset_wait,
+		[AP_SM_EVENT_TIMEOUT] = ap_sm_nop,
 	},
-	[AP_STATE_SETIRQ_WAIT] = {
-		[AP_EVENT_POLL] = ap_sm_setirq_wait,
-		[AP_EVENT_TIMEOUT] = ap_sm_nop,
+	[AP_SM_STATE_SETIRQ_WAIT] = {
+		[AP_SM_EVENT_POLL] = ap_sm_setirq_wait,
+		[AP_SM_EVENT_TIMEOUT] = ap_sm_nop,
 	},
-	[AP_STATE_IDLE] = {
-		[AP_EVENT_POLL] = ap_sm_write,
-		[AP_EVENT_TIMEOUT] = ap_sm_nop,
+	[AP_SM_STATE_IDLE] = {
+		[AP_SM_EVENT_POLL] = ap_sm_write,
+		[AP_SM_EVENT_TIMEOUT] = ap_sm_nop,
 	},
-	[AP_STATE_WORKING] = {
-		[AP_EVENT_POLL] = ap_sm_read_write,
-		[AP_EVENT_TIMEOUT] = ap_sm_reset,
+	[AP_SM_STATE_WORKING] = {
+		[AP_SM_EVENT_POLL] = ap_sm_read_write,
+		[AP_SM_EVENT_TIMEOUT] = ap_sm_reset,
 	},
-	[AP_STATE_QUEUE_FULL] = {
-		[AP_EVENT_POLL] = ap_sm_read,
-		[AP_EVENT_TIMEOUT] = ap_sm_reset,
+	[AP_SM_STATE_QUEUE_FULL] = {
+		[AP_SM_EVENT_POLL] = ap_sm_read,
+		[AP_SM_EVENT_TIMEOUT] = ap_sm_reset,
 	},
-	[AP_STATE_REMOVE] = {
-		[AP_EVENT_POLL] = ap_sm_nop,
-		[AP_EVENT_TIMEOUT] = ap_sm_nop,
+	[AP_SM_STATE_REMOVE] = {
+		[AP_SM_EVENT_POLL] = ap_sm_nop,
+		[AP_SM_EVENT_TIMEOUT] = ap_sm_nop,
 	},
-	[AP_STATE_UNBOUND] = {
-		[AP_EVENT_POLL] = ap_sm_nop,
-		[AP_EVENT_TIMEOUT] = ap_sm_nop,
+	[AP_SM_STATE_UNBOUND] = {
+		[AP_SM_EVENT_POLL] = ap_sm_nop,
+		[AP_SM_EVENT_TIMEOUT] = ap_sm_nop,
 	},
-	[AP_STATE_BORKED] = {
-		[AP_EVENT_POLL] = ap_sm_nop,
-		[AP_EVENT_TIMEOUT] = ap_sm_nop,
+	[AP_SM_STATE_BORKED] = {
+		[AP_SM_EVENT_POLL] = ap_sm_nop,
+		[AP_SM_EVENT_TIMEOUT] = ap_sm_nop,
 	},
 };
 
-enum ap_wait ap_sm_event(struct ap_queue *aq, enum ap_event event)
+enum ap_sm_wait ap_sm_event(struct ap_queue *aq, enum ap_sm_event event)
 {
-	return ap_jumptable[aq->state][event](aq);
+	return ap_jumptable[aq->sm_state][event](aq);
 }
 
-enum ap_wait ap_sm_event_loop(struct ap_queue *aq, enum ap_event event)
+enum ap_sm_wait ap_sm_event_loop(struct ap_queue *aq, enum ap_sm_event event)
 {
-	enum ap_wait wait;
+	enum ap_sm_wait wait;
 
-	while ((wait = ap_sm_event(aq, event)) == AP_WAIT_AGAIN)
+	while ((wait = ap_sm_event(aq, event)) == AP_SM_WAIT_AGAIN)
 		;
 	return wait;
 }
@@ -487,13 +487,13 @@ static ssize_t reset_show(struct device *dev,
 	int rc = 0;
 
 	spin_lock_bh(&aq->lock);
-	switch (aq->state) {
-	case AP_STATE_RESET_START:
-	case AP_STATE_RESET_WAIT:
+	switch (aq->sm_state) {
+	case AP_SM_STATE_RESET_START:
+	case AP_SM_STATE_RESET_WAIT:
 		rc = scnprintf(buf, PAGE_SIZE, "Reset in progress.\n");
 		break;
-	case AP_STATE_WORKING:
-	case AP_STATE_QUEUE_FULL:
+	case AP_SM_STATE_WORKING:
+	case AP_SM_STATE_QUEUE_FULL:
 		rc = scnprintf(buf, PAGE_SIZE, "Reset Timer armed.\n");
 		break;
 	default:
@@ -511,8 +511,8 @@ static ssize_t reset_store(struct device *dev,
 
 	spin_lock_bh(&aq->lock);
 	__ap_flush_queue(aq);
-	aq->state = AP_STATE_RESET_START;
-	ap_wait(ap_sm_event(aq, AP_EVENT_POLL));
+	aq->sm_state = AP_SM_STATE_RESET_START;
+	ap_wait(ap_sm_event(aq, AP_SM_EVENT_POLL));
 	spin_unlock_bh(&aq->lock);
 
 	AP_DBF(DBF_INFO, "reset queue=%02x.%04x triggered by user\n",
@@ -530,7 +530,7 @@ static ssize_t interrupt_show(struct device *dev,
 	int rc = 0;
 
 	spin_lock_bh(&aq->lock);
-	if (aq->state == AP_STATE_SETIRQ_WAIT)
+	if (aq->sm_state == AP_SM_STATE_SETIRQ_WAIT)
 		rc = scnprintf(buf, PAGE_SIZE, "Enable Interrupt pending.\n");
 	else if (aq->interrupt == AP_INTR_ENABLED)
 		rc = scnprintf(buf, PAGE_SIZE, "Interrupts enabled.\n");
@@ -587,7 +587,7 @@ struct ap_queue *ap_queue_create(ap_qid_t qid, int device_type)
 	aq->ap_dev.device.type = &ap_queue_type;
 	aq->ap_dev.device_type = device_type;
 	aq->qid = qid;
-	aq->state = AP_STATE_UNBOUND;
+	aq->sm_state = AP_SM_STATE_UNBOUND;
 	aq->interrupt = AP_INTR_DISABLED;
 	spin_lock_init(&aq->lock);
 	INIT_LIST_HEAD(&aq->pendingq);
@@ -602,7 +602,7 @@ void ap_queue_init_reply(struct ap_queue *aq, struct ap_message *reply)
 	aq->reply = reply;
 
 	spin_lock_bh(&aq->lock);
-	ap_wait(ap_sm_event(aq, AP_EVENT_POLL));
+	ap_wait(ap_sm_event(aq, AP_SM_EVENT_POLL));
 	spin_unlock_bh(&aq->lock);
 }
 EXPORT_SYMBOL(ap_queue_init_reply);
@@ -626,7 +626,7 @@ void ap_queue_message(struct ap_queue *aq, struct ap_message *ap_msg)
 	aq->total_request_count++;
 	atomic64_inc(&aq->card->total_request_count);
 	/* Send/receive as many request from the queue as possible. */
-	ap_wait(ap_sm_event_loop(aq, AP_EVENT_POLL));
+	ap_wait(ap_sm_event_loop(aq, AP_SM_EVENT_POLL));
 	spin_unlock_bh(&aq->lock);
 }
 EXPORT_SYMBOL(ap_queue_message);
@@ -699,7 +699,7 @@ void ap_queue_prepare_remove(struct ap_queue *aq)
 	/* flush queue */
 	__ap_flush_queue(aq);
 	/* set REMOVE state to prevent new messages are queued in */
-	aq->state = AP_STATE_REMOVE;
+	aq->sm_state = AP_SM_STATE_REMOVE;
 	spin_unlock_bh(&aq->lock);
 	del_timer_sync(&aq->timeout);
 }
@@ -708,22 +708,22 @@ void ap_queue_remove(struct ap_queue *aq)
 {
 	/*
 	 * all messages have been flushed and the state is
-	 * AP_STATE_REMOVE. Now reset with zero which also
+	 * AP_SM_STATE_REMOVE. Now reset with zero which also
 	 * clears the irq registration and move the state
-	 * to AP_STATE_UNBOUND to signal that this queue
+	 * to AP_SM_STATE_UNBOUND to signal that this queue
 	 * is not used by any driver currently.
 	 */
 	spin_lock_bh(&aq->lock);
 	ap_zapq(aq->qid);
-	aq->state = AP_STATE_UNBOUND;
+	aq->sm_state = AP_SM_STATE_UNBOUND;
 	spin_unlock_bh(&aq->lock);
 }
 
 void ap_queue_init_state(struct ap_queue *aq)
 {
 	spin_lock_bh(&aq->lock);
-	aq->state = AP_STATE_RESET_START;
-	ap_wait(ap_sm_event(aq, AP_EVENT_POLL));
+	aq->sm_state = AP_SM_STATE_RESET_START;
+	ap_wait(ap_sm_event(aq, AP_SM_EVENT_POLL));
 	spin_unlock_bh(&aq->lock);
 }
 EXPORT_SYMBOL(ap_queue_init_state);

From a303e88743f6514995c31fe611011935ea7f040c Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.ibm.com>
Date: Fri, 12 Jun 2020 10:13:23 +0200
Subject: [PATCH 258/502] s390/zcrypt: provide cex4 cca sysfs attributes for
 cex3

This patch introduces the sysfs attributes serialnr and
mkvps for cex2c and cex3c cards. These sysfs attributes
are available for cex4c and higher since
commit 7c4e91c0959b ("s390/zcrypt: new sysfs attributes serialnr and mkvps")
and this patch now provides the same for the older cex2
and cex3 cards.

Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 drivers/s390/crypto/zcrypt_cex2c.c | 114 +++++++++++++++++++++++++++++
 drivers/s390/crypto/zcrypt_cex4.c  |  26 +++++--
 2 files changed, 132 insertions(+), 8 deletions(-)

diff --git a/drivers/s390/crypto/zcrypt_cex2c.c b/drivers/s390/crypto/zcrypt_cex2c.c
index 993addb726e0..f00127a78bab 100644
--- a/drivers/s390/crypto/zcrypt_cex2c.c
+++ b/drivers/s390/crypto/zcrypt_cex2c.c
@@ -25,6 +25,7 @@
 #include "zcrypt_msgtype6.h"
 #include "zcrypt_cex2c.h"
 #include "zcrypt_cca_key.h"
+#include "zcrypt_ccamisc.h"
 
 #define CEX2C_MIN_MOD_SIZE	 16	/*  128 bits	*/
 #define CEX2C_MAX_MOD_SIZE	256	/* 2048 bits	*/
@@ -58,6 +59,91 @@ static struct ap_device_id zcrypt_cex2c_queue_ids[] = {
 
 MODULE_DEVICE_TABLE(ap, zcrypt_cex2c_queue_ids);
 
+/*
+ * CCA card additional device attributes
+ */
+static ssize_t cca_serialnr_show(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	struct cca_info ci;
+	struct ap_card *ac = to_ap_card(dev);
+	struct zcrypt_card *zc = ac->private;
+
+	memset(&ci, 0, sizeof(ci));
+
+	if (ap_domain_index >= 0)
+		cca_get_info(ac->id, ap_domain_index, &ci, zc->online);
+
+	return scnprintf(buf, PAGE_SIZE, "%s\n", ci.serial);
+}
+
+static struct device_attribute dev_attr_cca_serialnr =
+	__ATTR(serialnr, 0444, cca_serialnr_show, NULL);
+
+static struct attribute *cca_card_attrs[] = {
+	&dev_attr_cca_serialnr.attr,
+	NULL,
+};
+
+static const struct attribute_group cca_card_attr_grp = {
+	.attrs = cca_card_attrs,
+};
+
+ /*
+  * CCA queue additional device attributes
+  */
+static ssize_t cca_mkvps_show(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	int n = 0;
+	struct cca_info ci;
+	struct zcrypt_queue *zq = to_ap_queue(dev)->private;
+	static const char * const cao_state[] = { "invalid", "valid" };
+	static const char * const new_state[] = { "empty", "partial", "full" };
+
+	memset(&ci, 0, sizeof(ci));
+
+	cca_get_info(AP_QID_CARD(zq->queue->qid),
+		     AP_QID_QUEUE(zq->queue->qid),
+		     &ci, zq->online);
+
+	if (ci.new_mk_state >= '1' && ci.new_mk_state <= '3')
+		n = scnprintf(buf, PAGE_SIZE, "AES NEW: %s 0x%016llx\n",
+			      new_state[ci.new_mk_state - '1'], ci.new_mkvp);
+	else
+		n = scnprintf(buf, PAGE_SIZE, "AES NEW: - -\n");
+
+	if (ci.cur_mk_state >= '1' && ci.cur_mk_state <= '2')
+		n += scnprintf(buf + n, PAGE_SIZE - n,
+			       "AES CUR: %s 0x%016llx\n",
+			       cao_state[ci.cur_mk_state - '1'], ci.cur_mkvp);
+	else
+		n += scnprintf(buf + n, PAGE_SIZE - n, "AES CUR: - -\n");
+
+	if (ci.old_mk_state >= '1' && ci.old_mk_state <= '2')
+		n += scnprintf(buf + n, PAGE_SIZE - n,
+			       "AES OLD: %s 0x%016llx\n",
+			       cao_state[ci.old_mk_state - '1'], ci.old_mkvp);
+	else
+		n += scnprintf(buf + n, PAGE_SIZE - n, "AES OLD: - -\n");
+
+	return n;
+}
+
+static struct device_attribute dev_attr_cca_mkvps =
+	__ATTR(mkvps, 0444, cca_mkvps_show, NULL);
+
+static struct attribute *cca_queue_attrs[] = {
+	&dev_attr_cca_mkvps.attr,
+	NULL,
+};
+
+static const struct attribute_group cca_queue_attr_grp = {
+	.attrs = cca_queue_attrs,
+};
+
 /**
  * Large random number detection function. Its sends a message to a CEX2C/CEX3C
  * card to find out if large random numbers are supported.
@@ -178,6 +264,17 @@ static int zcrypt_cex2c_card_probe(struct ap_device *ap_dev)
 	if (rc) {
 		ac->private = NULL;
 		zcrypt_card_free(zc);
+		return rc;
+	}
+
+	if (ap_test_bit(&ac->functions, AP_FUNC_COPRO)) {
+		rc = sysfs_create_group(&ap_dev->device.kobj,
+					&cca_card_attr_grp);
+		if (rc) {
+			zcrypt_card_unregister(zc);
+			ac->private = NULL;
+			zcrypt_card_free(zc);
+		}
 	}
 
 	return rc;
@@ -189,8 +286,11 @@ static int zcrypt_cex2c_card_probe(struct ap_device *ap_dev)
  */
 static void zcrypt_cex2c_card_remove(struct ap_device *ap_dev)
 {
+	struct ap_card *ac = to_ap_card(&ap_dev->device);
 	struct zcrypt_card *zc = to_ap_card(&ap_dev->device)->private;
 
+	if (ap_test_bit(&ac->functions, AP_FUNC_COPRO))
+		sysfs_remove_group(&ap_dev->device.kobj, &cca_card_attr_grp);
 	if (zc)
 		zcrypt_card_unregister(zc);
 }
@@ -239,7 +339,19 @@ static int zcrypt_cex2c_queue_probe(struct ap_device *ap_dev)
 	if (rc) {
 		aq->private = NULL;
 		zcrypt_queue_free(zq);
+		return rc;
 	}
+
+	if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO)) {
+		rc = sysfs_create_group(&ap_dev->device.kobj,
+					&cca_queue_attr_grp);
+		if (rc) {
+			zcrypt_queue_unregister(zq);
+			aq->private = NULL;
+			zcrypt_queue_free(zq);
+		}
+	}
+
 	return rc;
 }
 
@@ -252,6 +364,8 @@ static void zcrypt_cex2c_queue_remove(struct ap_device *ap_dev)
 	struct ap_queue *aq = to_ap_queue(&ap_dev->device);
 	struct zcrypt_queue *zq = aq->private;
 
+	if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO))
+		sysfs_remove_group(&ap_dev->device.kobj, &cca_queue_attr_grp);
 	if (zq)
 		zcrypt_queue_unregister(zq);
 }
diff --git a/drivers/s390/crypto/zcrypt_cex4.c b/drivers/s390/crypto/zcrypt_cex4.c
index 337ec71ddb58..dc20d983e468 100644
--- a/drivers/s390/crypto/zcrypt_cex4.c
+++ b/drivers/s390/crypto/zcrypt_cex4.c
@@ -529,22 +529,27 @@ static int zcrypt_cex4_card_probe(struct ap_device *ap_dev)
 	if (rc) {
 		ac->private = NULL;
 		zcrypt_card_free(zc);
-		goto out;
+		return rc;
 	}
 
 	if (ap_test_bit(&ac->functions, AP_FUNC_COPRO)) {
 		rc = sysfs_create_group(&ap_dev->device.kobj,
 					&cca_card_attr_grp);
-		if (rc)
+		if (rc) {
 			zcrypt_card_unregister(zc);
+			ac->private = NULL;
+			zcrypt_card_free(zc);
+		}
 	} else if (ap_test_bit(&ac->functions, AP_FUNC_EP11)) {
 		rc = sysfs_create_group(&ap_dev->device.kobj,
 					&ep11_card_attr_grp);
-		if (rc)
+		if (rc) {
 			zcrypt_card_unregister(zc);
+			ac->private = NULL;
+			zcrypt_card_free(zc);
+		}
 	}
 
-out:
 	return rc;
 }
 
@@ -617,22 +622,27 @@ static int zcrypt_cex4_queue_probe(struct ap_device *ap_dev)
 	if (rc) {
 		aq->private = NULL;
 		zcrypt_queue_free(zq);
-		goto out;
+		return rc;
 	}
 
 	if (ap_test_bit(&aq->card->functions, AP_FUNC_COPRO)) {
 		rc = sysfs_create_group(&ap_dev->device.kobj,
 					&cca_queue_attr_grp);
-		if (rc)
+		if (rc) {
 			zcrypt_queue_unregister(zq);
+			aq->private = NULL;
+			zcrypt_queue_free(zq);
+		}
 	} else if (ap_test_bit(&aq->card->functions, AP_FUNC_EP11)) {
 		rc = sysfs_create_group(&ap_dev->device.kobj,
 					&ep11_queue_attr_grp);
-		if (rc)
+		if (rc) {
 			zcrypt_queue_unregister(zq);
+			aq->private = NULL;
+			zcrypt_queue_free(zq);
+		}
 	}
 
-out:
 	return rc;
 }
 

From bc67f10ad1d76a30e01c539c0043417fa34648d7 Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Fri, 3 Jul 2020 09:21:34 +0530
Subject: [PATCH 259/502] arm64/cpufeature: Add remaining feature bits in
 ID_AA64MMFR0 register

Enable ECV, FGT and EXS feature bits in ID_AA64MMFR0 register as per ARM DDI
0487F.a specification.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Link: https://lore.kernel.org/r/1593748297-1965-2-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/sysreg.h | 3 +++
 arch/arm64/kernel/cpufeature.c  | 3 +++
 2 files changed, 6 insertions(+)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 463175f80341..2e36dfde2570 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -706,6 +706,9 @@
 #define ID_AA64ZFR0_SVEVER_SVE2		0x1
 
 /* id_aa64mmfr0 */
+#define ID_AA64MMFR0_ECV_SHIFT		60
+#define ID_AA64MMFR0_FGT_SHIFT		56
+#define ID_AA64MMFR0_EXS_SHIFT		44
 #define ID_AA64MMFR0_TGRAN4_2_SHIFT	40
 #define ID_AA64MMFR0_TGRAN64_2_SHIFT	36
 #define ID_AA64MMFR0_TGRAN16_2_SHIFT	32
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9f63053a63a9..7a84f5f31527 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -269,6 +269,9 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_ECV_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_FGT_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_EXS_SHIFT, 4, 0),
 	/*
 	 * Page size not being supported at Stage-2 is not fatal. You
 	 * just give up KVM if PAGE_SIZE isn't supported there. Go fix

From 853772ba8023c25b1caae56b6426ca76dae1eaff Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Fri, 3 Jul 2020 09:21:35 +0530
Subject: [PATCH 260/502] arm64/cpufeature: Add remaining feature bits in
 ID_AA64MMFR1 register

Enable ETS, TWED, XNX and SPECSEI feature bits in ID_AA64MMFR1 register as
per ARM DDI 0487F.a specification.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Link: https://lore.kernel.org/r/1593748297-1965-3-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/sysreg.h | 4 ++++
 arch/arm64/kernel/cpufeature.c  | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 2e36dfde2570..889fa7729719 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -737,6 +737,10 @@
 #endif
 
 /* id_aa64mmfr1 */
+#define ID_AA64MMFR1_ETS_SHIFT		36
+#define ID_AA64MMFR1_TWED_SHIFT		32
+#define ID_AA64MMFR1_XNX_SHIFT		28
+#define ID_AA64MMFR1_SPECSEI_SHIFT	24
 #define ID_AA64MMFR1_PAN_SHIFT		20
 #define ID_AA64MMFR1_LOR_SHIFT		16
 #define ID_AA64MMFR1_HPD_SHIFT		12
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 7a84f5f31527..764793c4a188 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -315,6 +315,10 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_ETS_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_TWED_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_XNX_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_HIGHER_SAFE, ID_AA64MMFR1_SPECSEI_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_PAN_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_LOR_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_HPD_SHIFT, 4, 0),

From 356fdfbe8761da55c4100bd543259f349fc1ca3a Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Fri, 3 Jul 2020 09:21:36 +0530
Subject: [PATCH 261/502] arm64/cpufeature: Add remaining feature bits in
 ID_AA64MMFR2 register

Enable EVT, BBM, TTL, IDS, ST, NV and CCIDX feature bits in ID_AA64MMFR2
register as per ARM DDI 0487F.a specification.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Link: https://lore.kernel.org/r/1593748297-1965-4-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/sysreg.h | 7 +++++++
 arch/arm64/kernel/cpufeature.c  | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 889fa7729719..9ee324936ea2 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -753,8 +753,15 @@
 
 /* id_aa64mmfr2 */
 #define ID_AA64MMFR2_E0PD_SHIFT		60
+#define ID_AA64MMFR2_EVT_SHIFT		56
+#define ID_AA64MMFR2_BBM_SHIFT		52
+#define ID_AA64MMFR2_TTL_SHIFT		48
 #define ID_AA64MMFR2_FWB_SHIFT		40
+#define ID_AA64MMFR2_IDS_SHIFT		36
 #define ID_AA64MMFR2_AT_SHIFT		32
+#define ID_AA64MMFR2_ST_SHIFT		28
+#define ID_AA64MMFR2_NV_SHIFT		24
+#define ID_AA64MMFR2_CCIDX_SHIFT	20
 #define ID_AA64MMFR2_LVA_SHIFT		16
 #define ID_AA64MMFR2_IESB_SHIFT		12
 #define ID_AA64MMFR2_LSM_SHIFT		8
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 764793c4a188..93797d9bb931 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -330,8 +330,15 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
 
 static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_E0PD_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_EVT_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_BBM_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_TTL_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IDS_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_ST_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_NV_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_CCIDX_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_IESB_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LSM_SHIFT, 4, 0),

From 8d3154afc10dd474265b62752cd169f66f40ae0d Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Fri, 3 Jul 2020 09:21:37 +0530
Subject: [PATCH 262/502] arm64/cpufeature: Replace all open bits shift
 encodings with macros

There are many open bits shift encodings for various CPU ID registers that
are scattered across cpufeature. This replaces them with register specific
sensible macro definitions. This should not have any functional change.

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: James Morse <james.morse@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Link: https://lore.kernel.org/r/1593748297-1965-5-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/sysreg.h | 28 +++++++++++++++++
 arch/arm64/kernel/cpufeature.c  | 53 +++++++++++++++++----------------
 2 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 9ee324936ea2..b74c727c3bcd 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -769,6 +769,7 @@
 #define ID_AA64MMFR2_CNP_SHIFT		0
 
 /* id_aa64dfr0 */
+#define ID_AA64DFR0_DOUBLELOCK_SHIFT	36
 #define ID_AA64DFR0_PMSVER_SHIFT	32
 #define ID_AA64DFR0_CTX_CMPS_SHIFT	28
 #define ID_AA64DFR0_WRPS_SHIFT		20
@@ -821,18 +822,40 @@
 #define ID_ISAR6_DP_SHIFT		4
 #define ID_ISAR6_JSCVT_SHIFT		0
 
+#define ID_MMFR0_INNERSHR_SHIFT		28
+#define ID_MMFR0_FCSE_SHIFT		24
+#define ID_MMFR0_AUXREG_SHIFT		20
+#define ID_MMFR0_TCM_SHIFT		16
+#define ID_MMFR0_SHARELVL_SHIFT		12
+#define ID_MMFR0_OUTERSHR_SHIFT		8
+#define ID_MMFR0_PMSA_SHIFT		4
+#define ID_MMFR0_VMSA_SHIFT		0
+
 #define ID_MMFR4_EVT_SHIFT		28
 #define ID_MMFR4_CCIDX_SHIFT		24
 #define ID_MMFR4_LSM_SHIFT		20
 #define ID_MMFR4_HPDS_SHIFT		16
 #define ID_MMFR4_CNP_SHIFT		12
 #define ID_MMFR4_XNX_SHIFT		8
+#define ID_MMFR4_AC2_SHIFT		4
 #define ID_MMFR4_SPECSEI_SHIFT		0
 
 #define ID_MMFR5_ETS_SHIFT		0
 
 #define ID_PFR0_DIT_SHIFT		24
 #define ID_PFR0_CSV2_SHIFT		16
+#define ID_PFR0_STATE3_SHIFT		12
+#define ID_PFR0_STATE2_SHIFT		8
+#define ID_PFR0_STATE1_SHIFT		4
+#define ID_PFR0_STATE0_SHIFT		0
+
+#define ID_DFR0_PERFMON_SHIFT		24
+#define ID_DFR0_MPROFDBG_SHIFT		20
+#define ID_DFR0_MMAPTRC_SHIFT		16
+#define ID_DFR0_COPTRC_SHIFT		12
+#define ID_DFR0_MMAPDBG_SHIFT		8
+#define ID_DFR0_COPSDBG_SHIFT		4
+#define ID_DFR0_COPDBG_SHIFT		0
 
 #define ID_PFR2_SSBS_SHIFT		4
 #define ID_PFR2_CSV3_SHIFT		0
@@ -875,6 +898,11 @@
 #define ID_AA64MMFR0_TGRAN_SUPPORTED	ID_AA64MMFR0_TGRAN64_SUPPORTED
 #endif
 
+#define MVFR2_FPMISC_SHIFT		4
+#define MVFR2_SIMDMISC_SHIFT		0
+
+#define DCZID_DZP_SHIFT			4
+#define DCZID_BS_SHIFT			0
 
 /*
  * The ZCR_ELx_LEN_* definitions intentionally include bits [8:4] which
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 93797d9bb931..19146bd338b4 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -359,7 +359,7 @@ static const struct arm64_ftr_bits ftr_ctr[] = {
 	 * make use of *minLine.
 	 * If we have differing I-cache policies, report it as the weakest - VIPT.
 	 */
-	ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, 14, 2, ICACHE_POLICY_VIPT),	/* L1Ip */
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_EXACT, CTR_L1IP_SHIFT, 2, ICACHE_POLICY_VIPT),	/* L1Ip */
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, CTR_IMINLINE_SHIFT, 4, 0),
 	ARM64_FTR_END,
 };
@@ -370,19 +370,19 @@ struct arm64_ftr_reg arm64_ftr_reg_ctrel0 = {
 };
 
 static const struct arm64_ftr_bits ftr_id_mmfr0[] = {
-	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0xf),	/* InnerShr */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0),	/* FCSE */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, 20, 4, 0),	/* AuxReg */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0),	/* TCM */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0),	/* ShareLvl */
-	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0xf),	/* OuterShr */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0),	/* PMSA */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0),	/* VMSA */
+	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_INNERSHR_SHIFT, 4, 0xf),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_FCSE_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_MMFR0_AUXREG_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_TCM_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_SHARELVL_SHIFT, 4, 0),
+	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_OUTERSHR_SHIFT, 4, 0xf),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_PMSA_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR0_VMSA_SHIFT, 4, 0),
 	ARM64_FTR_END,
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = {
-	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 36, 4, 0),
+	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_DOUBLELOCK_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64DFR0_PMSVER_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_CTX_CMPS_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64DFR0_WRPS_SHIFT, 4, 0),
@@ -398,14 +398,14 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = {
 };
 
 static const struct arm64_ftr_bits ftr_mvfr2[] = {
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0),		/* FPMisc */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0),		/* SIMDMisc */
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_FPMISC_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, MVFR2_SIMDMISC_SHIFT, 4, 0),
 	ARM64_FTR_END,
 };
 
 static const struct arm64_ftr_bits ftr_dczid[] = {
-	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, 4, 1, 1),		/* DZP */
-	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0),	/* BS */
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, DCZID_DZP_SHIFT, 1, 1),
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, DCZID_BS_SHIFT, 4, 0),
 	ARM64_FTR_END,
 };
 
@@ -437,7 +437,8 @@ static const struct arm64_ftr_bits ftr_id_mmfr4[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_HPDS_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_CNP_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_XNX_SHIFT, 4, 0),
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0),	/* ac2 */
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_MMFR4_AC2_SHIFT, 4, 0),
+
 	/*
 	 * SpecSEI = 1 indicates that the PE might generate an SError on an
 	 * external abort on speculative read. It is safe to assume that an
@@ -479,10 +480,10 @@ static const struct arm64_ftr_bits ftr_id_isar6[] = {
 static const struct arm64_ftr_bits ftr_id_pfr0[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_DIT_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_PFR0_CSV2_SHIFT, 4, 0),
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0),		/* State3 */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0),		/* State2 */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0),		/* State1 */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0),		/* State0 */
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE3_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE2_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE1_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_PFR0_STATE0_SHIFT, 4, 0),
 	ARM64_FTR_END,
 };
 
@@ -506,13 +507,13 @@ static const struct arm64_ftr_bits ftr_id_pfr2[] = {
 
 static const struct arm64_ftr_bits ftr_id_dfr0[] = {
 	/* [31:28] TraceFilt */
-	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0xf),	/* PerfMon */
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0),
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0),
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 12, 4, 0),
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 8, 4, 0),
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 4, 4, 0),
-	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 0, 4, 0),
+	S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_PERFMON_SHIFT, 4, 0xf),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MPROFDBG_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MMAPTRC_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPTRC_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_MMAPDBG_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPSDBG_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_DFR0_COPDBG_SHIFT, 4, 0),
 	ARM64_FTR_END,
 };
 

From 2a379716f3d76aebc5574155de247b547a0214cc Mon Sep 17 00:00:00 2001
From: Bhupesh Sharma <bhsharma@redhat.com>
Date: Tue, 7 Apr 2020 04:01:40 +0530
Subject: [PATCH 263/502] arm64/defconfig: Enable CONFIG_KEXEC_FILE

kexec_file_load() syscall interface is now supported for
arm64 architecture as well via commits:
3751e728cef2 ("arm64: kexec_file: add crash dump support") and
3ddd9992a590 ("arm64: enable KEXEC_FILE config").

This patch enables config KEXEC_FILE by default in the
arm64 defconfig, so that user-space tools like kexec-tools
can use the same as the default interface for kexec/kdump
on arm64.

Cc: AKASHI Takahiro <takahiro.akashi@linaro.org>
Cc: James Morse <james.morse@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: kexec@lists.infradead.org

Signed-off-by: Bhupesh Sharma <bhsharma@redhat.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/1586212300-30797-1-git-send-email-bhsharma@redhat.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/configs/defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 883e8bace3ed..1a33697a8492 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -66,6 +66,7 @@ CONFIG_SCHED_SMT=y
 CONFIG_NUMA=y
 CONFIG_SECCOMP=y
 CONFIG_KEXEC=y
+CONFIG_KEXEC_FILE=y
 CONFIG_CRASH_DUMP=y
 CONFIG_XEN=y
 CONFIG_COMPAT=y

From a1634a542f74309f843742fa849208bb26e279e4 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gshan@redhat.com>
Date: Tue, 30 Jun 2020 16:24:28 +1000
Subject: [PATCH 264/502] arm64/mm: Redefine CONT_{PTE, PMD}_SHIFT

Currently, the value of CONT_{PTE, PMD}_SHIFT is an offset relative to the
standard {PAGE, PMD}_SHIFT. As a result, {PAGE, PMD}_SHIFT has to be added
whenever CONT_{PTE, PMD}_SHIFT is used in the function hugetlbpage_init(),
which is a bit confusing.

This redefines CONT_{PTE, PMD}_SHIFT with {PAGE, PMD}_SHIFT included
so that the latter values needn't be added when using the former ones
in function hugetlbpage_init(). Note that the values of CONT_{PTES, PMDS}
are unchanged.

Suggested-by: Will Deacon <will@kernel.org>
Signed-off-by: Gavin Shan <gshan@redhat.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Link: https://lkml.org/lkml/2020/5/6/190
Link: https://lore.kernel.org/r/20200630062428.194235-1-gshan@redhat.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/pgtable-hwdef.h | 16 ++++++++--------
 arch/arm64/mm/hugetlbpage.c            |  4 ++--
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 9c91a8f93a0e..ce3d14abb360 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -82,20 +82,20 @@
  * Contiguous page definitions.
  */
 #ifdef CONFIG_ARM64_64K_PAGES
-#define CONT_PTE_SHIFT		5
-#define CONT_PMD_SHIFT		5
+#define CONT_PTE_SHIFT		(5 + PAGE_SHIFT)
+#define CONT_PMD_SHIFT		(5 + PMD_SHIFT)
 #elif defined(CONFIG_ARM64_16K_PAGES)
-#define CONT_PTE_SHIFT		7
-#define CONT_PMD_SHIFT		5
+#define CONT_PTE_SHIFT		(7 + PAGE_SHIFT)
+#define CONT_PMD_SHIFT		(5 + PMD_SHIFT)
 #else
-#define CONT_PTE_SHIFT		4
-#define CONT_PMD_SHIFT		4
+#define CONT_PTE_SHIFT		(4 + PAGE_SHIFT)
+#define CONT_PMD_SHIFT		(4 + PMD_SHIFT)
 #endif
 
-#define CONT_PTES		(1 << CONT_PTE_SHIFT)
+#define CONT_PTES		(1 << (CONT_PTE_SHIFT - PAGE_SHIFT))
 #define CONT_PTE_SIZE		(CONT_PTES * PAGE_SIZE)
 #define CONT_PTE_MASK		(~(CONT_PTE_SIZE - 1))
-#define CONT_PMDS		(1 << CONT_PMD_SHIFT)
+#define CONT_PMDS		(1 << (CONT_PMD_SHIFT - PMD_SHIFT))
 #define CONT_PMD_SIZE		(CONT_PMDS * PMD_SIZE)
 #define CONT_PMD_MASK		(~(CONT_PMD_SIZE - 1))
 /* the the numerical offset of the PTE within a range of CONT_PTES */
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index 0a52ce46f020..c79084739096 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -457,9 +457,9 @@ static int __init hugetlbpage_init(void)
 #ifdef CONFIG_ARM64_4K_PAGES
 	hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
 #endif
-	hugetlb_add_hstate((CONT_PMD_SHIFT + PMD_SHIFT) - PAGE_SHIFT);
+	hugetlb_add_hstate(CONT_PMD_SHIFT - PAGE_SHIFT);
 	hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
-	hugetlb_add_hstate((CONT_PTE_SHIFT + PAGE_SHIFT) - PAGE_SHIFT);
+	hugetlb_add_hstate(CONT_PTE_SHIFT - PAGE_SHIFT);
 
 	return 0;
 }

From 4c6e277c4cc4a6b3b2b9c66a7b014787ae757cc1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 1 Jul 2020 11:29:10 -0600
Subject: [PATCH 265/502] io_uring: abstract out task work running

Provide a helper to run task_work instead of checking and running
manually in a bunch of different spots. While doing so, also move the
task run state setting to where we run the task work. Then we can move it
out of the callback helpers. This also helps ensure we only do this once
per task_work list run, not per task_work item.

Suggested-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7426e4f23f9b..65a6978e1795 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1714,7 +1714,6 @@ static void __io_req_task_submit(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 
-	__set_current_state(TASK_RUNNING);
 	if (!__io_sq_thread_acquire_mm(ctx)) {
 		mutex_lock(&ctx->uring_lock);
 		__io_queue_sqe(req, NULL, NULL);
@@ -1899,6 +1898,17 @@ static int io_put_kbuf(struct io_kiocb *req)
 	return cflags;
 }
 
+static inline bool io_run_task_work(void)
+{
+	if (current->task_works) {
+		__set_current_state(TASK_RUNNING);
+		task_work_run();
+		return true;
+	}
+
+	return false;
+}
+
 static void io_iopoll_queue(struct list_head *again)
 {
 	struct io_kiocb *req;
@@ -2079,8 +2089,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 		 */
 		if (!(++iters & 7)) {
 			mutex_unlock(&ctx->uring_lock);
-			if (current->task_works)
-				task_work_run();
+			io_run_task_work();
 			mutex_lock(&ctx->uring_lock);
 		}
 
@@ -2176,8 +2185,6 @@ static void io_rw_resubmit(struct callback_head *cb)
 	struct io_ring_ctx *ctx = req->ctx;
 	int err;
 
-	__set_current_state(TASK_RUNNING);
-
 	err = io_sq_thread_acquire_mm(ctx, req);
 
 	if (io_resubmit_prep(req, err)) {
@@ -6361,8 +6368,7 @@ static int io_sq_thread(void *data)
 			if (!list_empty(&ctx->poll_list) || need_resched() ||
 			    (!time_after(jiffies, timeout) && ret != -EBUSY &&
 			    !percpu_ref_is_dying(&ctx->refs))) {
-				if (current->task_works)
-					task_work_run();
+				io_run_task_work();
 				cond_resched();
 				continue;
 			}
@@ -6394,8 +6400,7 @@ static int io_sq_thread(void *data)
 					finish_wait(&ctx->sqo_wait, &wait);
 					break;
 				}
-				if (current->task_works) {
-					task_work_run();
+				if (io_run_task_work()) {
 					finish_wait(&ctx->sqo_wait, &wait);
 					continue;
 				}
@@ -6420,8 +6425,7 @@ static int io_sq_thread(void *data)
 		timeout = jiffies + ctx->sq_thread_idle;
 	}
 
-	if (current->task_works)
-		task_work_run();
+	io_run_task_work();
 
 	io_sq_thread_drop_mm(ctx);
 	revert_creds(old_cred);
@@ -6486,9 +6490,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 	do {
 		if (io_cqring_events(ctx, false) >= min_events)
 			return 0;
-		if (!current->task_works)
+		if (!io_run_task_work())
 			break;
-		task_work_run();
 	} while (1);
 
 	if (sig) {
@@ -6510,8 +6513,8 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
 		prepare_to_wait_exclusive(&ctx->wait, &iowq.wq,
 						TASK_INTERRUPTIBLE);
 		/* make sure we run task_work before checking for signals */
-		if (current->task_works)
-			task_work_run();
+		if (io_run_task_work())
+			continue;
 		if (signal_pending(current)) {
 			if (current->jobctl & JOBCTL_TASK_WORK) {
 				spin_lock_irq(&current->sighand->siglock);
@@ -7953,8 +7956,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 	int submitted = 0;
 	struct fd f;
 
-	if (current->task_works)
-		task_work_run();
+	io_run_task_work();
 
 	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP))
 		return -EINVAL;

From c2c4c83c58cbca23527fee93b49738a5a84272a1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 1 Jul 2020 15:37:11 -0600
Subject: [PATCH 266/502] io_uring: use new io_req_task_work_add() helper
 throughout

Since we now have that in the 5.9 branch, convert the existing users of
task_work_add() to use this new helper.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 77 +++++++++++++++++++++++++--------------------------
 1 file changed, 37 insertions(+), 40 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 65a6978e1795..2b849984bae5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1689,6 +1689,29 @@ static struct io_kiocb *io_req_find_next(struct io_kiocb *req)
 	return __io_req_find_next(req);
 }
 
+static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
+{
+	struct task_struct *tsk = req->task;
+	struct io_ring_ctx *ctx = req->ctx;
+	int ret, notify = TWA_RESUME;
+
+	/*
+	 * SQPOLL kernel thread doesn't need notification, just a wakeup.
+	 * If we're not using an eventfd, then TWA_RESUME is always fine,
+	 * as we won't have dependencies between request completions for
+	 * other kernel wait conditions.
+	 */
+	if (ctx->flags & IORING_SETUP_SQPOLL)
+		notify = 0;
+	else if (ctx->cq_ev_fd)
+		notify = TWA_SIGNAL;
+
+	ret = task_work_add(tsk, cb, notify);
+	if (!ret)
+		wake_up_process(tsk);
+	return ret;
+}
+
 static void __io_req_task_cancel(struct io_kiocb *req, int error)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -1732,18 +1755,19 @@ static void io_req_task_submit(struct callback_head *cb)
 
 static void io_req_task_queue(struct io_kiocb *req)
 {
-	struct task_struct *tsk = req->task;
 	int ret;
 
 	init_task_work(&req->task_work, io_req_task_submit);
 
-	ret = task_work_add(tsk, &req->task_work, true);
+	ret = io_req_task_work_add(req, &req->task_work);
 	if (unlikely(ret)) {
+		struct task_struct *tsk;
+
 		init_task_work(&req->task_work, io_req_task_cancel);
 		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &req->task_work, true);
+		task_work_add(tsk, &req->task_work, 0);
+		wake_up_process(tsk);
 	}
-	wake_up_process(tsk);
 }
 
 static void io_queue_next(struct io_kiocb *req)
@@ -2197,19 +2221,15 @@ static void io_rw_resubmit(struct callback_head *cb)
 static bool io_rw_reissue(struct io_kiocb *req, long res)
 {
 #ifdef CONFIG_BLOCK
-	struct task_struct *tsk;
 	int ret;
 
 	if ((res != -EAGAIN && res != -EOPNOTSUPP) || io_wq_current_is_worker())
 		return false;
 
-	tsk = req->task;
 	init_task_work(&req->task_work, io_rw_resubmit);
-	ret = task_work_add(tsk, &req->task_work, true);
-	if (!ret) {
-		wake_up_process(tsk);
+	ret = io_req_task_work_add(req, &req->task_work);
+	if (!ret)
 		return true;
-	}
 #endif
 	return false;
 }
@@ -2909,7 +2929,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 	struct io_kiocb *req = wait->private;
 	struct io_async_rw *rw = &req->io->rw;
 	struct wait_page_key *key = arg;
-	struct task_struct *tsk;
 	int ret;
 
 	wpq = container_of(wait, struct wait_page_queue, wait);
@@ -2923,15 +2942,16 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 	init_task_work(&rw->task_work, io_async_buf_retry);
 	/* submit ref gets dropped, acquire a new one */
 	refcount_inc(&req->refs);
-	tsk = req->task;
-	ret = task_work_add(tsk, &rw->task_work, true);
+	ret = io_req_task_work_add(req, &rw->task_work);
 	if (unlikely(ret)) {
+		struct task_struct *tsk;
+
 		/* queue just for cancelation */
 		init_task_work(&rw->task_work, io_async_buf_cancel);
 		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &rw->task_work, true);
+		task_work_add(tsk, &rw->task_work, 0);
+		wake_up_process(tsk);
 	}
-	wake_up_process(tsk);
 	return 1;
 }
 
@@ -4424,33 +4444,9 @@ struct io_poll_table {
 	int error;
 };
 
-static int io_req_task_work_add(struct io_kiocb *req, struct callback_head *cb)
-{
-	struct task_struct *tsk = req->task;
-	struct io_ring_ctx *ctx = req->ctx;
-	int ret, notify = TWA_RESUME;
-
-	/*
-	 * SQPOLL kernel thread doesn't need notification, just a wakeup.
-	 * If we're not using an eventfd, then TWA_RESUME is always fine,
-	 * as we won't have dependencies between request completions for
-	 * other kernel wait conditions.
-	 */
-	if (ctx->flags & IORING_SETUP_SQPOLL)
-		notify = 0;
-	else if (ctx->cq_ev_fd)
-		notify = TWA_SIGNAL;
-
-	ret = task_work_add(tsk, cb, notify);
-	if (!ret)
-		wake_up_process(tsk);
-	return ret;
-}
-
 static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 			   __poll_t mask, task_work_func_t func)
 {
-	struct task_struct *tsk;
 	int ret;
 
 	/* for instances that support it check for an event match first: */
@@ -4461,7 +4457,6 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 
 	list_del_init(&poll->wait.entry);
 
-	tsk = req->task;
 	req->result = mask;
 	init_task_work(&req->task_work, func);
 	/*
@@ -4472,6 +4467,8 @@ static int __io_async_wake(struct io_kiocb *req, struct io_poll_iocb *poll,
 	 */
 	ret = io_req_task_work_add(req, &req->task_work);
 	if (unlikely(ret)) {
+		struct task_struct *tsk;
+
 		WRITE_ONCE(poll->canceled, true);
 		tsk = io_wq_get_task(req->ctx->io_wq);
 		task_work_add(tsk, &req->task_work, 0);

From 6df1db6b542436c6d429caa66e1045862fa36155 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 3 Jul 2020 22:15:06 +0300
Subject: [PATCH 267/502] io_uring: fix mis-refcounting linked timeouts

io_prep_linked_timeout() sets REQ_F_LINK_TIMEOUT, altering refcounting of
the following linked request. After that someone should call
io_queue_linked_timeout(), otherwise a submission reference of the linked
timeout will never be dropped.

That's what happens in io_steal_work() if io-wq decides to postpone linked
request with io_wqe_enqueue(). io_queue_linked_timeout() can also be
potentially called twice without synchronisation during re-submission,
e.g. io_rw_resubmit().

The rule is: whoever called io_prep_linked_timeout() must also call
io_queue_linked_timeout(). To avoid doing it twice,
io_prep_linked_timeout() returns non-NULL only for the first call. That's
controlled by the REQ_F_LINK_TIMEOUT flag.

Also kill REQ_F_QUEUE_TIMEOUT.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 33 +++++++--------------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2b849984bae5..cf1b3d4ac241 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -538,7 +538,6 @@ enum {
 	REQ_F_POLLED_BIT,
 	REQ_F_BUFFER_SELECTED_BIT,
 	REQ_F_NO_FILE_TABLE_BIT,
-	REQ_F_QUEUE_TIMEOUT_BIT,
 	REQ_F_WORK_INITIALIZED_BIT,
 	REQ_F_TASK_PINNED_BIT,
 
@@ -586,8 +585,6 @@ enum {
 	REQ_F_BUFFER_SELECTED	= BIT(REQ_F_BUFFER_SELECTED_BIT),
 	/* doesn't need file table for this request */
 	REQ_F_NO_FILE_TABLE	= BIT(REQ_F_NO_FILE_TABLE_BIT),
-	/* needs to queue linked timeout */
-	REQ_F_QUEUE_TIMEOUT	= BIT(REQ_F_QUEUE_TIMEOUT_BIT),
 	/* io_wq_work is initialized */
 	REQ_F_WORK_INITIALIZED	= BIT(REQ_F_WORK_INITIALIZED_BIT),
 	/* req->task is refcounted */
@@ -1842,7 +1839,7 @@ static void io_put_req(struct io_kiocb *req)
 
 static struct io_wq_work *io_steal_work(struct io_kiocb *req)
 {
-	struct io_kiocb *timeout, *nxt = NULL;
+	struct io_kiocb *nxt;
 
 	/*
 	 * A ref is owned by io-wq in which context we're. So, if that's the
@@ -1853,13 +1850,7 @@ static struct io_wq_work *io_steal_work(struct io_kiocb *req)
 		return NULL;
 
 	nxt = io_req_find_next(req);
-	if (!nxt)
-		return NULL;
-
-	timeout = io_prep_linked_timeout(nxt);
-	if (timeout)
-		nxt->flags |= REQ_F_QUEUE_TIMEOUT;
-	return &nxt->work;
+	return nxt ? &nxt->work : NULL;
 }
 
 /*
@@ -5702,24 +5693,15 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
-static void io_arm_async_linked_timeout(struct io_kiocb *req)
-{
-	struct io_kiocb *link;
-
-	/* link head's timeout is queued in io_queue_async_work() */
-	if (!(req->flags & REQ_F_QUEUE_TIMEOUT))
-		return;
-
-	link = list_first_entry(&req->link_list, struct io_kiocb, link_list);
-	io_queue_linked_timeout(link);
-}
-
 static struct io_wq_work *io_wq_submit_work(struct io_wq_work *work)
 {
 	struct io_kiocb *req = container_of(work, struct io_kiocb, work);
+	struct io_kiocb *timeout;
 	int ret = 0;
 
-	io_arm_async_linked_timeout(req);
+	timeout = io_prep_linked_timeout(req);
+	if (timeout)
+		io_queue_linked_timeout(timeout);
 
 	/* if NO_CANCEL is set, we must still run the work */
 	if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) ==
@@ -5893,8 +5875,7 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 
 	if (!(req->flags & REQ_F_LINK_HEAD))
 		return NULL;
-	/* for polled retry, if flag is set, we already went through here */
-	if (req->flags & REQ_F_POLLED)
+	if (req->flags & REQ_F_LINK_TIMEOUT)
 		return NULL;
 
 	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,

From 652532ad459524d32c6bf1522e0b88d83b084d1a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 3 Jul 2020 22:15:07 +0300
Subject: [PATCH 268/502] io_uring: keep queue_sqe()'s fail path separately

A preparation patch: extract the error path into a separate block. It looks
saner than calling req_set_fail_links() after io_put_req_find_next(), even
though that has been working well.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index cf1b3d4ac241..7147e87a24b5 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5937,22 +5937,21 @@ punt:
 		goto exit;
 	}
 
+	if (unlikely(ret)) {
 err:
+		/* un-prep timeout, so it'll be killed as any other linked */
+		req->flags &= ~REQ_F_LINK_TIMEOUT;
+		req_set_fail_links(req);
+		io_put_req(req);
+		io_req_complete(req, ret);
+		goto exit;
+	}
+
 	/* drop submission reference */
 	nxt = io_put_req_find_next(req);
+	if (linked_timeout)
+		io_queue_linked_timeout(linked_timeout);
 
-	if (linked_timeout) {
-		if (!ret)
-			io_queue_linked_timeout(linked_timeout);
-		else
-			io_put_req(linked_timeout);
-	}
-
-	/* and drop final reference, if we failed */
-	if (ret) {
-		req_set_fail_links(req);
-		io_req_complete(req, ret);
-	}
 	if (nxt) {
 		req = nxt;
 

From 8b3656af2a37dc538d21e144a5a94bacae05e9f1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Fri, 3 Jul 2020 22:15:08 +0300
Subject: [PATCH 269/502] io_uring: fix lost cqe->flags

Don't forget to fill cqe->flags properly in io_submit_flush_completions()

Fixes: a1d7c393c4711 ("io_uring: enable READ/WRITE to use deferred completions")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7147e87a24b5..9464f9470bbc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1416,7 +1416,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
 
 		req = list_first_entry(&cs->list, struct io_kiocb, list);
 		list_del(&req->list);
-		io_cqring_fill_event(req, req->result);
+		__io_cqring_fill_event(req, req->result, req->cflags);
 		if (!(req->flags & REQ_F_LINK_HEAD)) {
 			req->flags |= REQ_F_COMP_LOCKED;
 			io_put_req(req);
@@ -1441,6 +1441,7 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
 		io_put_req(req);
 	} else {
 		req->result = res;
+		req->cflags = cflags;
 		list_add_tail(&req->list, &cs->list);
 		if (++cs->nr >= 32)
 			io_submit_flush_completions(cs);

From 3aadc23e6054353ca056bf14e87250c79efbd7ed Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 6 Jul 2020 17:59:29 +0300
Subject: [PATCH 270/502] io_uring: don't delay iopoll'ed req completion

->iopoll() may have completed the current request, but instead of reaping
it, io_do_iopoll() just continues with the next request in the list.
As a result, it can leave a just-polled and completed request in the list
up until the next syscall. Even the outer loop in io_iopoll_getevents()
doesn't help the situation.

E.g. poll_list: req0 -> req1
If req0->iopoll() completed both requests, and @min<=1,
then @req0 will be left behind.

Check whether a req was completed after ->iopoll().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 9464f9470bbc..60f1a81c6c35 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2015,6 +2015,10 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		if (ret < 0)
 			break;
 
+		/* iopoll may have completed current req */
+		if (READ_ONCE(req->iopoll_completed))
+			list_move_tail(&req->list, &done);
+
 		if (ret && spin)
 			spin = false;
 		ret = 0;

From eba0a4dd2aa5c47ca5b0c56ffb6d6665e047ff72 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 6 Jul 2020 17:59:30 +0300
Subject: [PATCH 271/502] io_uring: fix stopping iopoll'ing too early

Nobody adjusts *nr_events (number of completed requests) before calling
io_iopoll_getevents(), so the passed @min shouldn't be adjusted either.
Otherwise it can return less than the initially requested @min without
hitting need_resched().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 60f1a81c6c35..332008f346e3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2044,7 +2044,7 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		ret = io_do_iopoll(ctx, nr_events, min);
 		if (ret < 0)
 			return ret;
-		if (!min || *nr_events >= min)
+		if (*nr_events >= min)
 			return 0;
 	}
 
@@ -2087,8 +2087,6 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 	 */
 	mutex_lock(&ctx->uring_lock);
 	do {
-		int tmin = 0;
-
 		/*
 		 * Don't enter poll loop if we already have events pending.
 		 * If we do, we can potentially be spinning for commands that
@@ -2113,10 +2111,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 			mutex_lock(&ctx->uring_lock);
 		}
 
-		if (*nr_events < min)
-			tmin = min - *nr_events;
-
-		ret = io_iopoll_getevents(ctx, nr_events, tmin);
+		ret = io_iopoll_getevents(ctx, nr_events, min);
 		if (ret <= 0)
 			break;
 		ret = 0;

From 3fcee5a6d5414df8ff4ee22f2477bde76d34527c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 6 Jul 2020 17:59:31 +0300
Subject: [PATCH 272/502] io_uring: briefly loose locks while reaping events

It's not nice to hold @uring_lock for too long in io_iopoll_reap_events().
For instance, the lock is needed to publish requests to @poll_list, and
holding it locks out tasks doing that for no good reason. Release it
occasionally.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 332008f346e3..6e3169834bf7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2069,8 +2069,13 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 		/*
 		 * Ensure we allow local-to-the-cpu processing to take place,
 		 * in this case we need to ensure that we reap all events.
+		 * Also let task_work, etc. to progress by releasing the mutex
 		 */
-		cond_resched();
+		if (need_resched()) {
+			mutex_unlock(&ctx->uring_lock);
+			cond_resched();
+			mutex_lock(&ctx->uring_lock);
+		}
 	}
 	mutex_unlock(&ctx->uring_lock);
 }

From b037b09b9058d84882fa2c4db3806433e2b0f912 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2020 10:02:58 -0700
Subject: [PATCH 273/502] x86/entry: Rename idtentry_enter/exit_cond_rcu() to
 idtentry_enter/exit()

They were originally called _cond_rcu because they were special versions
with conditional RCU handling.  Now they're the standard entry and exit
path, so the _cond_rcu part is just confusing.  Drop it.

Also change the signature to make them more extensible and more foolproof.

No functional change -- it's pure refactoring.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/247fc67685263e0b673e1d7f808182d28ff80359.1593795633.git.luto@kernel.org
---
 arch/x86/entry/common.c         | 50 ++++++++++++++++++---------------
 arch/x86/include/asm/idtentry.h | 28 ++++++++++--------
 arch/x86/kernel/kvm.c           |  6 ++--
 arch/x86/kernel/traps.c         |  6 ++--
 arch/x86/mm/fault.c             |  6 ++--
 5 files changed, 53 insertions(+), 43 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index e83b3f14897c..0521546022cb 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -559,8 +559,7 @@ SYSCALL_DEFINE0(ni_syscall)
 }
 
 /**
- * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional
- *			     RCU handling
+ * idtentry_enter - Handle state tracking on ordinary idtentries
  * @regs:	Pointer to pt_regs of interrupted context
  *
  * Invokes:
@@ -572,6 +571,9 @@ SYSCALL_DEFINE0(ni_syscall)
  *  - The hardirq tracer to keep the state consistent as low level ASM
  *    entry disabled interrupts.
  *
+ * As a precondition, this requires that the entry came from user mode,
+ * idle, or a kernel context in which RCU is watching.
+ *
  * For kernel mode entries RCU handling is done conditional. If RCU is
  * watching then the only RCU requirement is to check whether the tick has
  * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
@@ -585,18 +587,21 @@ SYSCALL_DEFINE0(ni_syscall)
  * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
  * would not be possible.
  *
- * Returns: True if RCU has been adjusted on a kernel entry
- *	    False otherwise
+ * Returns: An opaque object that must be passed to idtentry_exit()
  *
- * The return value must be fed into the rcu_exit argument of
- * idtentry_exit_cond_rcu().
+ * The return value must be fed into the state argument of
+ * idtentry_exit().
  */
-bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
+idtentry_state_t noinstr idtentry_enter(struct pt_regs *regs)
 {
+	idtentry_state_t ret = {
+		.exit_rcu = false,
+	};
+
 	if (user_mode(regs)) {
 		check_user_regs(regs);
 		enter_from_user_mode();
-		return false;
+		return ret;
 	}
 
 	/*
@@ -634,7 +639,8 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
 		trace_hardirqs_off_finish();
 		instrumentation_end();
 
-		return true;
+		ret.exit_rcu = true;
+		return ret;
 	}
 
 	/*
@@ -649,7 +655,7 @@ bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
 	trace_hardirqs_off();
 	instrumentation_end();
 
-	return false;
+	return ret;
 }
 
 static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
@@ -667,10 +673,9 @@ static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
 }
 
 /**
- * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU
- *			    handling
+ * idtentry_exit - Handle return from exception that used idtentry_enter()
  * @regs:	Pointer to pt_regs (exception entry regs)
- * @rcu_exit:	Invoke rcu_irq_exit() if true
+ * @state:	Return value from matching call to idtentry_enter()
  *
  * Depending on the return target (kernel/user) this runs the necessary
  * preemption and work checks if possible and reguired and returns to
@@ -679,10 +684,10 @@ static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
  * This is the last action before returning to the low level ASM code which
  * just needs to return to the appropriate context.
  *
- * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry
- * function must be fed into the @rcu_exit argument.
+ * Counterpart to idtentry_enter(). The return value of the entry
+ * function must be fed into the @state argument.
  */
-void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
+void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state)
 {
 	lockdep_assert_irqs_disabled();
 
@@ -695,7 +700,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
 		 * carefully and needs the same ordering of lockdep/tracing
 		 * and RCU as the return to user mode path.
 		 */
-		if (rcu_exit) {
+		if (state.exit_rcu) {
 			instrumentation_begin();
 			/* Tell the tracer that IRET will enable interrupts */
 			trace_hardirqs_on_prepare();
@@ -714,7 +719,7 @@ void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
 		 * IRQ flags state is correct already. Just tell RCU if it
 		 * was not watching on entry.
 		 */
-		if (rcu_exit)
+		if (state.exit_rcu)
 			rcu_irq_exit();
 	}
 }
@@ -800,9 +805,10 @@ static void __xen_pv_evtchn_do_upcall(void)
 __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs;
-	bool inhcall, rcu_exit;
+	bool inhcall;
+	idtentry_state_t state;
 
-	rcu_exit = idtentry_enter_cond_rcu(regs);
+	state = idtentry_enter(regs);
 	old_regs = set_irq_regs(regs);
 
 	instrumentation_begin();
@@ -812,13 +818,13 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
 	set_irq_regs(old_regs);
 
 	inhcall = get_and_clear_inhcall();
-	if (inhcall && !WARN_ON_ONCE(rcu_exit)) {
+	if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
 		instrumentation_begin();
 		idtentry_exit_cond_resched(regs, true);
 		instrumentation_end();
 		restore_inhcall(inhcall);
 	} else {
-		idtentry_exit_cond_rcu(regs, rcu_exit);
+		idtentry_exit(regs, state);
 	}
 }
 #endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index eeac6dc2adaa..7227225cf45d 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -13,8 +13,12 @@
 void idtentry_enter_user(struct pt_regs *regs);
 void idtentry_exit_user(struct pt_regs *regs);
 
-bool idtentry_enter_cond_rcu(struct pt_regs *regs);
-void idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit);
+typedef struct idtentry_state {
+	bool exit_rcu;
+} idtentry_state_t;
+
+idtentry_state_t idtentry_enter(struct pt_regs *regs);
+void idtentry_exit(struct pt_regs *regs, idtentry_state_t state);
 
 /**
  * DECLARE_IDTENTRY - Declare functions for simple IDT entry points
@@ -54,12 +58,12 @@ static __always_inline void __##func(struct pt_regs *regs);		\
 									\
 __visible noinstr void func(struct pt_regs *regs)			\
 {									\
-	bool rcu_exit = idtentry_enter_cond_rcu(regs);			\
+	idtentry_state_t state = idtentry_enter(regs);			\
 									\
 	instrumentation_begin();					\
 	__##func (regs);						\
 	instrumentation_end();						\
-	idtentry_exit_cond_rcu(regs, rcu_exit);				\
+	idtentry_exit(regs, state);					\
 }									\
 									\
 static __always_inline void __##func(struct pt_regs *regs)
@@ -101,12 +105,12 @@ static __always_inline void __##func(struct pt_regs *regs,		\
 __visible noinstr void func(struct pt_regs *regs,			\
 			    unsigned long error_code)			\
 {									\
-	bool rcu_exit = idtentry_enter_cond_rcu(regs);			\
+	idtentry_state_t state = idtentry_enter(regs);			\
 									\
 	instrumentation_begin();					\
 	__##func (regs, error_code);					\
 	instrumentation_end();						\
-	idtentry_exit_cond_rcu(regs, rcu_exit);				\
+	idtentry_exit(regs, state);					\
 }									\
 									\
 static __always_inline void __##func(struct pt_regs *regs,		\
@@ -199,7 +203,7 @@ static __always_inline void __##func(struct pt_regs *regs, u8 vector);	\
 __visible noinstr void func(struct pt_regs *regs,			\
 			    unsigned long error_code)			\
 {									\
-	bool rcu_exit = idtentry_enter_cond_rcu(regs);			\
+	idtentry_state_t state = idtentry_enter(regs);			\
 									\
 	instrumentation_begin();					\
 	irq_enter_rcu();						\
@@ -207,7 +211,7 @@ __visible noinstr void func(struct pt_regs *regs,			\
 	__##func (regs, (u8)error_code);				\
 	irq_exit_rcu();							\
 	instrumentation_end();						\
-	idtentry_exit_cond_rcu(regs, rcu_exit);				\
+	idtentry_exit(regs, state);					\
 }									\
 									\
 static __always_inline void __##func(struct pt_regs *regs, u8 vector)
@@ -241,7 +245,7 @@ static void __##func(struct pt_regs *regs);				\
 									\
 __visible noinstr void func(struct pt_regs *regs)			\
 {									\
-	bool rcu_exit = idtentry_enter_cond_rcu(regs);			\
+	idtentry_state_t state = idtentry_enter(regs);			\
 									\
 	instrumentation_begin();					\
 	irq_enter_rcu();						\
@@ -249,7 +253,7 @@ __visible noinstr void func(struct pt_regs *regs)			\
 	run_on_irqstack_cond(__##func, regs, regs);			\
 	irq_exit_rcu();							\
 	instrumentation_end();						\
-	idtentry_exit_cond_rcu(regs, rcu_exit);				\
+	idtentry_exit(regs, state);					\
 }									\
 									\
 static noinline void __##func(struct pt_regs *regs)
@@ -270,7 +274,7 @@ static __always_inline void __##func(struct pt_regs *regs);		\
 									\
 __visible noinstr void func(struct pt_regs *regs)			\
 {									\
-	bool rcu_exit = idtentry_enter_cond_rcu(regs);			\
+	idtentry_state_t state = idtentry_enter(regs);			\
 									\
 	instrumentation_begin();					\
 	__irq_enter_raw();						\
@@ -278,7 +282,7 @@ __visible noinstr void func(struct pt_regs *regs)			\
 	__##func (regs);						\
 	__irq_exit_raw();						\
 	instrumentation_end();						\
-	idtentry_exit_cond_rcu(regs, rcu_exit);				\
+	idtentry_exit(regs, state);					\
 }									\
 									\
 static __always_inline void __##func(struct pt_regs *regs)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index df63786e7bfa..3f78482d9496 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
 noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 {
 	u32 reason = kvm_read_and_reset_apf_flags();
-	bool rcu_exit;
+	idtentry_state_t state;
 
 	switch (reason) {
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
@@ -243,7 +243,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 		return false;
 	}
 
-	rcu_exit = idtentry_enter_cond_rcu(regs);
+	state = idtentry_enter(regs);
 	instrumentation_begin();
 
 	/*
@@ -264,7 +264,7 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
 	}
 
 	instrumentation_end();
-	idtentry_exit_cond_rcu(regs, rcu_exit);
+	idtentry_exit(regs, state);
 	return true;
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b038695f36c5..4627f826fb57 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -245,7 +245,7 @@ static noinstr bool handle_bug(struct pt_regs *regs)
 
 DEFINE_IDTENTRY_RAW(exc_invalid_op)
 {
-	bool rcu_exit;
+	idtentry_state_t state;
 
 	/*
 	 * We use UD2 as a short encoding for 'CALL __WARN', as such
@@ -255,11 +255,11 @@ DEFINE_IDTENTRY_RAW(exc_invalid_op)
 	if (!user_mode(regs) && handle_bug(regs))
 		return;
 
-	rcu_exit = idtentry_enter_cond_rcu(regs);
+	state = idtentry_enter(regs);
 	instrumentation_begin();
 	handle_invalid_op(regs);
 	instrumentation_end();
-	idtentry_exit_cond_rcu(regs, rcu_exit);
+	idtentry_exit(regs, state);
 }
 
 DEFINE_IDTENTRY(exc_coproc_segment_overrun)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 1ead568c0101..5e41949453cc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1377,7 +1377,7 @@ handle_page_fault(struct pt_regs *regs, unsigned long error_code,
 DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
 {
 	unsigned long address = read_cr2();
-	bool rcu_exit;
+	idtentry_state_t state;
 
 	prefetchw(&current->mm->mmap_lock);
 
@@ -1412,11 +1412,11 @@ DEFINE_IDTENTRY_RAW_ERRORCODE(exc_page_fault)
 	 * code reenabled RCU to avoid subsequent wreckage which helps
 	 * debugability.
 	 */
-	rcu_exit = idtentry_enter_cond_rcu(regs);
+	state = idtentry_enter(regs);
 
 	instrumentation_begin();
 	handle_page_fault(regs, error_code, address);
 	instrumentation_end();
 
-	idtentry_exit_cond_rcu(regs, rcu_exit);
+	idtentry_exit(regs, state);
 }

From 552ae76face5584085845646c5f57e10c1a4ebdc Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Sat, 22 Dec 2018 12:00:10 +0000
Subject: [PATCH 274/502] arm64: Detect the ARMv8.4 TTL feature

In order to reduce the cost of TLB invalidation, the ARMv8.4 TTL
feature allows TLB invalidation instructions to be issued with a level
hint, allowing for quicker invalidation.

Let's detect the feature for now. Further patches will implement
its actual usage.

Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/cpucaps.h |  3 ++-
 arch/arm64/include/asm/sysreg.h  |  1 +
 arch/arm64/kernel/cpufeature.c   | 11 +++++++++++
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index d7b3bb0cb180..d44ba903d11d 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -62,7 +62,8 @@
 #define ARM64_HAS_GENERIC_AUTH			52
 #define ARM64_HAS_32BIT_EL1			53
 #define ARM64_BTI				54
+#define ARM64_HAS_ARMv8_4_TTL			55
 
-#define ARM64_NCAPS				55
+#define ARM64_NCAPS				56
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 463175f80341..8c209aa17273 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -746,6 +746,7 @@
 
 /* id_aa64mmfr2 */
 #define ID_AA64MMFR2_E0PD_SHIFT		60
+#define ID_AA64MMFR2_TTL_SHIFT		48
 #define ID_AA64MMFR2_FWB_SHIFT		40
 #define ID_AA64MMFR2_AT_SHIFT		32
 #define ID_AA64MMFR2_LVA_SHIFT		16
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9f63053a63a9..e877f56ff1ab 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -323,6 +323,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
 
 static const struct arm64_ftr_bits ftr_id_aa64mmfr2[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_E0PD_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_TTL_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_FWB_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_AT_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR2_LVA_SHIFT, 4, 0),
@@ -1882,6 +1883,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.matches = has_cpuid_feature,
 		.cpu_enable = cpu_has_fwb,
 	},
+	{
+		.desc = "ARMv8.4 Translation Table Level",
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.capability = ARM64_HAS_ARMv8_4_TTL,
+		.sys_reg = SYS_ID_AA64MMFR2_EL1,
+		.sign = FTR_UNSIGNED,
+		.field_pos = ID_AA64MMFR2_TTL_SHIFT,
+		.min_field_value = 1,
+		.matches = has_cpuid_feature,
+	},
 #ifdef CONFIG_ARM64_HW_AFDBM
 	{
 		/*

From 6fcfdf6d72898d1c5118d7dd3d3d38690e2f6a64 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Fri, 28 Dec 2018 09:11:50 +0000
Subject: [PATCH 275/502] arm64: Document SW reserved PTE/PMD bits in Stage-2
 descriptors

Advertise bits [58:55] as reserved for SW in the S2 descriptors.

Reviewed-by: Andrew Scull <ascull@google.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/pgtable-hwdef.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 9c91a8f93a0e..de0b603955f4 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -178,10 +178,12 @@
 #define PTE_S2_RDONLY		(_AT(pteval_t, 1) << 6)   /* HAP[2:1] */
 #define PTE_S2_RDWR		(_AT(pteval_t, 3) << 6)   /* HAP[2:1] */
 #define PTE_S2_XN		(_AT(pteval_t, 2) << 53)  /* XN[1:0] */
+#define PTE_S2_SW_RESVD		(_AT(pteval_t, 15) << 55) /* Reserved for SW */
 
 #define PMD_S2_RDONLY		(_AT(pmdval_t, 1) << 6)   /* HAP[2:1] */
 #define PMD_S2_RDWR		(_AT(pmdval_t, 3) << 6)   /* HAP[2:1] */
 #define PMD_S2_XN		(_AT(pmdval_t, 2) << 53)  /* XN[1:0] */
+#define PMD_S2_SW_RESVD		(_AT(pmdval_t, 15) << 55) /* Reserved for SW */
 
 #define PUD_S2_RDONLY		(_AT(pudval_t, 1) << 6)   /* HAP[2:1] */
 #define PUD_S2_RDWR		(_AT(pudval_t, 3) << 6)   /* HAP[2:1] */

From c10bc62ae4d2135c9db40e96a8e994164faee531 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Wed, 2 Jan 2019 10:21:29 +0000
Subject: [PATCH 276/502] arm64: Add level-hinted TLB invalidation helper

Add a level-hinted TLB invalidation helper that only gets used if
ARMv8.4-TTL gets detected.

Reviewed-by: Alexandru Elisei <alexandru.elisei@arm.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
---
 arch/arm64/include/asm/stage2_pgtable.h |  9 +++++
 arch/arm64/include/asm/tlbflush.h       | 45 +++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index b767904f28b1..996bf98f0cab 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
@@ -256,4 +256,13 @@ stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
 	return (boundary - 1 < end - 1) ? boundary : end;
 }
 
+/*
+ * Level values for the ARMv8.4-TTL extension, mapping PUD/PMD/PTE and
+ * the architectural page-table level.
+ */
+#define S2_NO_LEVEL_HINT	0
+#define S2_PUD_LEVEL		1
+#define S2_PMD_LEVEL		2
+#define S2_PTE_LEVEL		3
+
 #endif	/* __ARM64_S2_PGTABLE_H_ */
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index bc3949064725..3353f26302de 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -10,6 +10,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/bitfield.h>
 #include <linux/mm_types.h>
 #include <linux/sched.h>
 #include <asm/cputype.h>
@@ -59,6 +60,50 @@
 		__ta;						\
 	})
 
+/*
+ * Level-based TLBI operations.
+ *
+ * When ARMv8.4-TTL exists, TLBI operations take an additional hint for
+ * the level at which the invalidation must take place. If the level is
+ * wrong, no invalidation may take place. In the case where the level
+ * cannot be easily determined, a 0 value for the level parameter will
+ * perform a non-hinted invalidation.
+ *
+ * For Stage-2 invalidation, use the level values provided to that effect
+ * in asm/stage2_pgtable.h.
+ */
+#define TLBI_TTL_MASK		GENMASK_ULL(47, 44)
+#define TLBI_TTL_TG_4K		1
+#define TLBI_TTL_TG_16K		2
+#define TLBI_TTL_TG_64K		3
+
+#define __tlbi_level(op, addr, level)					\
+	do {								\
+		u64 arg = addr;						\
+									\
+		if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) &&	\
+		    level) {						\
+			u64 ttl = level & 3;				\
+									\
+			switch (PAGE_SIZE) {				\
+			case SZ_4K:					\
+				ttl |= TLBI_TTL_TG_4K << 2;		\
+				break;					\
+			case SZ_16K:					\
+				ttl |= TLBI_TTL_TG_16K << 2;		\
+				break;					\
+			case SZ_64K:					\
+				ttl |= TLBI_TTL_TG_64K << 2;		\
+				break;					\
+			}						\
+									\
+			arg &= ~TLBI_TTL_MASK;				\
+			arg |= FIELD_PREP(TLBI_TTL_MASK, ttl);		\
+		}							\
+									\
+		__tlbi(op, arg);					\
+	} while(0)
+
 /*
  *	TLB Invalidation
  *	================

From 7af928851508fb25207806f57e287272dd498981 Mon Sep 17 00:00:00 2001
From: Andrew Scull <ascull@google.com>
Date: Thu, 18 Jun 2020 15:55:11 +0100
Subject: [PATCH 277/502] smccc: Make constants available to assembly

Move constants out of the C-only section of the header next to the other
constants that are available to assembly.

Signed-off-by: Andrew Scull <ascull@google.com>
Reviewed-by: Sudeep Holla <sudeep.holla@arm.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200618145511.69203-1-ascull@google.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/linux/arm-smccc.h | 44 +++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/include/linux/arm-smccc.h b/include/linux/arm-smccc.h
index 56d6a5c6e353..efcbde731f03 100644
--- a/include/linux/arm-smccc.h
+++ b/include/linux/arm-smccc.h
@@ -81,6 +81,28 @@
 			   ARM_SMCCC_SMC_32,				\
 			   0, 0x7fff)
 
+/* Paravirtualised time calls (defined by ARM DEN0057A) */
+#define ARM_SMCCC_HV_PV_TIME_FEATURES				\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
+			   ARM_SMCCC_SMC_64,			\
+			   ARM_SMCCC_OWNER_STANDARD_HYP,	\
+			   0x20)
+
+#define ARM_SMCCC_HV_PV_TIME_ST					\
+	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
+			   ARM_SMCCC_SMC_64,			\
+			   ARM_SMCCC_OWNER_STANDARD_HYP,	\
+			   0x21)
+
+/*
+ * Return codes defined in ARM DEN 0070A
+ * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C
+ */
+#define SMCCC_RET_SUCCESS			0
+#define SMCCC_RET_NOT_SUPPORTED			-1
+#define SMCCC_RET_NOT_REQUIRED			-2
+#define SMCCC_RET_INVALID_PARAMETER		-3
+
 #ifndef __ASSEMBLY__
 
 #include <linux/linkage.h>
@@ -331,15 +353,6 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
  */
 #define arm_smccc_1_1_hvc(...)	__arm_smccc_1_1(SMCCC_HVC_INST, __VA_ARGS__)
 
-/*
- * Return codes defined in ARM DEN 0070A
- * ARM DEN 0070A is now merged/consolidated into ARM DEN 0028 C
- */
-#define SMCCC_RET_SUCCESS			0
-#define SMCCC_RET_NOT_SUPPORTED			-1
-#define SMCCC_RET_NOT_REQUIRED			-2
-#define SMCCC_RET_INVALID_PARAMETER		-3
-
 /*
  * Like arm_smccc_1_1* but always returns SMCCC_RET_NOT_SUPPORTED.
  * Used when the SMCCC conduit is not defined. The empty asm statement
@@ -385,18 +398,5 @@ asmlinkage void __arm_smccc_hvc(unsigned long a0, unsigned long a1,
 		method;							\
 	})
 
-/* Paravirtualised time calls (defined by ARM DEN0057A) */
-#define ARM_SMCCC_HV_PV_TIME_FEATURES				\
-	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
-			   ARM_SMCCC_SMC_64,			\
-			   ARM_SMCCC_OWNER_STANDARD_HYP,	\
-			   0x20)
-
-#define ARM_SMCCC_HV_PV_TIME_ST					\
-	ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL,			\
-			   ARM_SMCCC_SMC_64,			\
-			   ARM_SMCCC_OWNER_STANDARD_HYP,	\
-			   0x21)
-
 #endif /*__ASSEMBLY__*/
 #endif /*__LINUX_ARM_SMCCC_H*/

From e735b98a5fe08c0f50f9fdc3e3a844e3638e6649 Mon Sep 17 00:00:00 2001
From: Zhenyu Ye <yezhenyu2@huawei.com>
Date: Thu, 25 Jun 2020 16:03:11 +0800
Subject: [PATCH 278/502] arm64: Add tlbi_user_level TLB invalidation helper

Add a level-hinted parameter to __tlbi_user, which only gets used
if ARMv8.4-TTL gets detected.

ARMv8.4-TTL provides the TTL field in tlbi instruction to indicate
the level of translation table walk holding the leaf entry for the
address that is being invalidated.

This patch sets the default level value of flush_tlb_range() to 0,
which will be updated in future patches. It also sets the TTL value of
flush_tlb_page_nosync() to 3, because it is only called to flush a
single pte page.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
Link: https://lore.kernel.org/r/20200625080314.230-4-yezhenyu2@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/tlbflush.h | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 3353f26302de..e1d07612e147 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -104,6 +104,11 @@
 		__tlbi(op, arg);					\
 	} while(0)
 
+#define __tlbi_user_level(op, arg, level) do {				\
+	if (arm64_kernel_unmapped_at_el0())				\
+		__tlbi_level(op, (arg | USER_ASID_FLAG), level);	\
+} while (0)
+
 /*
  *	TLB Invalidation
  *	================
@@ -205,8 +210,9 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
 	unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm));
 
 	dsb(ishst);
-	__tlbi(vale1is, addr);
-	__tlbi_user(vale1is, addr);
+	/* This function is only called on a small page */
+	__tlbi_level(vale1is, addr, 3);
+	__tlbi_user_level(vale1is, addr, 3);
 }
 
 static inline void flush_tlb_page(struct vm_area_struct *vma,
@@ -246,11 +252,11 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 	dsb(ishst);
 	for (addr = start; addr < end; addr += stride) {
 		if (last_level) {
-			__tlbi(vale1is, addr);
-			__tlbi_user(vale1is, addr);
+			__tlbi_level(vale1is, addr, 0);
+			__tlbi_user_level(vale1is, addr, 0);
 		} else {
-			__tlbi(vae1is, addr);
-			__tlbi_user(vae1is, addr);
+			__tlbi_level(vae1is, addr, 0);
+			__tlbi_user_level(vae1is, addr, 0);
 		}
 	}
 	dsb(ish);

From 2631ed00b0498810f8d5c2163c6b5270d893687b Mon Sep 17 00:00:00 2001
From: "Peter Zijlstra (Intel)" <peterz@infradead.org>
Date: Thu, 25 Jun 2020 16:03:12 +0800
Subject: [PATCH 279/502] tlb: mmu_gather: add tlb_flush_*_range APIs

tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and
tlb->end, then set corresponding cleared_*.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20200625080314.230-5-yezhenyu2@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 include/asm-generic/tlb.h | 55 ++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 3f1649a8cf55..ef75ec86f865 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -512,6 +512,38 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 }
 #endif
 
+/*
+ * tlb_flush_{pte|pmd|pud|p4d}_range() adjust the tlb->start and tlb->end,
+ * and set corresponding cleared_*.
+ */
+static inline void tlb_flush_pte_range(struct mmu_gather *tlb,
+				     unsigned long address, unsigned long size)
+{
+	__tlb_adjust_range(tlb, address, size);
+	tlb->cleared_ptes = 1;
+}
+
+static inline void tlb_flush_pmd_range(struct mmu_gather *tlb,
+				     unsigned long address, unsigned long size)
+{
+	__tlb_adjust_range(tlb, address, size);
+	tlb->cleared_pmds = 1;
+}
+
+static inline void tlb_flush_pud_range(struct mmu_gather *tlb,
+				     unsigned long address, unsigned long size)
+{
+	__tlb_adjust_range(tlb, address, size);
+	tlb->cleared_puds = 1;
+}
+
+static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
+				     unsigned long address, unsigned long size)
+{
+	__tlb_adjust_range(tlb, address, size);
+	tlb->cleared_p4ds = 1;
+}
+
 #ifndef __tlb_remove_tlb_entry
 #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
 #endif
@@ -525,19 +557,17 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
  */
 #define tlb_remove_tlb_entry(tlb, ptep, address)		\
 	do {							\
-		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
-		tlb->cleared_ptes = 1;				\
+		tlb_flush_pte_range(tlb, address, PAGE_SIZE);	\
 		__tlb_remove_tlb_entry(tlb, ptep, address);	\
 	} while (0)
 
 #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address)	\
 	do {							\
 		unsigned long _sz = huge_page_size(h);		\
-		__tlb_adjust_range(tlb, address, _sz);		\
 		if (_sz == PMD_SIZE)				\
-			tlb->cleared_pmds = 1;			\
+			tlb_flush_pmd_range(tlb, address, _sz);	\
 		else if (_sz == PUD_SIZE)			\
-			tlb->cleared_puds = 1;			\
+			tlb_flush_pud_range(tlb, address, _sz);	\
 		__tlb_remove_tlb_entry(tlb, ptep, address);	\
 	} while (0)
 
@@ -551,8 +581,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 
 #define tlb_remove_pmd_tlb_entry(tlb, pmdp, address)			\
 	do {								\
-		__tlb_adjust_range(tlb, address, HPAGE_PMD_SIZE);	\
-		tlb->cleared_pmds = 1;					\
+		tlb_flush_pmd_range(tlb, address, HPAGE_PMD_SIZE);	\
 		__tlb_remove_pmd_tlb_entry(tlb, pmdp, address);		\
 	} while (0)
 
@@ -566,8 +595,7 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 
 #define tlb_remove_pud_tlb_entry(tlb, pudp, address)			\
 	do {								\
-		__tlb_adjust_range(tlb, address, HPAGE_PUD_SIZE);	\
-		tlb->cleared_puds = 1;					\
+		tlb_flush_pud_range(tlb, address, HPAGE_PUD_SIZE);	\
 		__tlb_remove_pud_tlb_entry(tlb, pudp, address);		\
 	} while (0)
 
@@ -592,9 +620,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 #ifndef pte_free_tlb
 #define pte_free_tlb(tlb, ptep, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb_flush_pmd_range(tlb, address, PAGE_SIZE);	\
 		tlb->freed_tables = 1;				\
-		tlb->cleared_pmds = 1;				\
 		__pte_free_tlb(tlb, ptep, address);		\
 	} while (0)
 #endif
@@ -602,9 +629,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 #ifndef pmd_free_tlb
 #define pmd_free_tlb(tlb, pmdp, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb_flush_pud_range(tlb, address, PAGE_SIZE);	\
 		tlb->freed_tables = 1;				\
-		tlb->cleared_puds = 1;				\
 		__pmd_free_tlb(tlb, pmdp, address);		\
 	} while (0)
 #endif
@@ -612,9 +638,8 @@ static inline void tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vm
 #ifndef pud_free_tlb
 #define pud_free_tlb(tlb, pudp, address)			\
 	do {							\
-		__tlb_adjust_range(tlb, address, PAGE_SIZE);	\
+		tlb_flush_p4d_range(tlb, address, PAGE_SIZE);	\
 		tlb->freed_tables = 1;				\
-		tlb->cleared_p4ds = 1;				\
 		__pud_free_tlb(tlb, pudp, address);		\
 	} while (0)
 #endif

From c4ab2cbc1d8768eb505708a58c54c277dfe4a93d Mon Sep 17 00:00:00 2001
From: Zhenyu Ye <yezhenyu2@huawei.com>
Date: Thu, 25 Jun 2020 16:03:13 +0800
Subject: [PATCH 280/502] arm64: tlb: Set the TTL field in flush_tlb_range

This patch uses the cleared_* in struct mmu_gather to set the
TTL field in flush_tlb_range().

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Link: https://lore.kernel.org/r/20200625080314.230-6-yezhenyu2@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/tlb.h      | 29 ++++++++++++++++++++++++++++-
 arch/arm64/include/asm/tlbflush.h | 14 ++++++++------
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index b76df828e6b7..61c97d3b58c7 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -21,11 +21,37 @@ static void tlb_flush(struct mmu_gather *tlb);
 
 #include <asm-generic/tlb.h>
 
+/*
+ * Get the TLBI level on arm64.  The default value is 0 if more than one
+ * of cleared_* is set or none is set.
+ * Arm64 doesn't support p4ds now.
+ */
+static inline int tlb_get_level(struct mmu_gather *tlb)
+{
+	if (tlb->cleared_ptes && !(tlb->cleared_pmds ||
+				   tlb->cleared_puds ||
+				   tlb->cleared_p4ds))
+		return 3;
+
+	if (tlb->cleared_pmds && !(tlb->cleared_ptes ||
+				   tlb->cleared_puds ||
+				   tlb->cleared_p4ds))
+		return 2;
+
+	if (tlb->cleared_puds && !(tlb->cleared_ptes ||
+				   tlb->cleared_pmds ||
+				   tlb->cleared_p4ds))
+		return 1;
+
+	return 0;
+}
+
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
 	struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0);
 	bool last_level = !tlb->freed_tables;
 	unsigned long stride = tlb_get_unmap_size(tlb);
+	int tlb_level = tlb_get_level(tlb);
 
 	/*
 	 * If we're tearing down the address space then we only care about
@@ -38,7 +64,8 @@ static inline void tlb_flush(struct mmu_gather *tlb)
 		return;
 	}
 
-	__flush_tlb_range(&vma, tlb->start, tlb->end, stride, last_level);
+	__flush_tlb_range(&vma, tlb->start, tlb->end, stride,
+			  last_level, tlb_level);
 }
 
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index e1d07612e147..3505f6fbfca3 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -230,7 +230,8 @@ static inline void flush_tlb_page(struct vm_area_struct *vma,
 
 static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long start, unsigned long end,
-				     unsigned long stride, bool last_level)
+				     unsigned long stride, bool last_level,
+				     int tlb_level)
 {
 	unsigned long asid = ASID(vma->vm_mm);
 	unsigned long addr;
@@ -252,11 +253,11 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 	dsb(ishst);
 	for (addr = start; addr < end; addr += stride) {
 		if (last_level) {
-			__tlbi_level(vale1is, addr, 0);
-			__tlbi_user_level(vale1is, addr, 0);
+			__tlbi_level(vale1is, addr, tlb_level);
+			__tlbi_user_level(vale1is, addr, tlb_level);
 		} else {
-			__tlbi_level(vae1is, addr, 0);
-			__tlbi_user_level(vae1is, addr, 0);
+			__tlbi_level(vae1is, addr, tlb_level);
+			__tlbi_user_level(vae1is, addr, tlb_level);
 		}
 	}
 	dsb(ish);
@@ -268,8 +269,9 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
 	/*
 	 * We cannot use leaf-only invalidation here, since we may be invalidating
 	 * table entries as part of collapsing hugepages or moving page tables.
+	 * Set the tlb_level to 0 because we can not get enough information here.
 	 */
-	__flush_tlb_range(vma, start, end, PAGE_SIZE, false);
+	__flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0);
 }
 
 static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end)

From a7ac1cfa4c0510217e74c2ba807ead549f80d82c Mon Sep 17 00:00:00 2001
From: Zhenyu Ye <yezhenyu2@huawei.com>
Date: Thu, 25 Jun 2020 16:03:14 +0800
Subject: [PATCH 281/502] arm64: tlb: Set the TTL field in flush_*_tlb_range

This patch implements flush_{pmd|pud}_tlb_range() in arm64 by
calling __flush_tlb_range() with the corresponding stride and
tlb_level values.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
Link: https://lore.kernel.org/r/20200625080314.230-7-yezhenyu2@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/pgtable.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 758e2d1577d0..d5d3fbe73953 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -40,6 +40,16 @@ extern void __pmd_error(const char *file, int line, unsigned long val);
 extern void __pud_error(const char *file, int line, unsigned long val);
 extern void __pgd_error(const char *file, int line, unsigned long val);
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
+
+/* Set stride and tlb_level in flush_*_tlb_range */
+#define flush_pmd_tlb_range(vma, addr, end)	\
+	__flush_tlb_range(vma, addr, end, PMD_SIZE, false, 2)
+#define flush_pud_tlb_range(vma, addr, end)	\
+	__flush_tlb_range(vma, addr, end, PUD_SIZE, false, 1)
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..

From 34e36d81a0ef76047fa12a0f8e0dce4369b435cf Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Tue, 7 Jul 2020 11:26:14 +0100
Subject: [PATCH 282/502] arm64: Shift the __tlbi_level() indentation left

This is for consistency with the other __tlbi macros in this file. No
functional change.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/tlbflush.h | 43 +++++++++++++++----------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 3505f6fbfca3..39aed2efd21b 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -77,32 +77,31 @@
 #define TLBI_TTL_TG_16K		2
 #define TLBI_TTL_TG_64K		3
 
-#define __tlbi_level(op, addr, level)					\
-	do {								\
-		u64 arg = addr;						\
+#define __tlbi_level(op, addr, level) do {				\
+	u64 arg = addr;							\
 									\
-		if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) &&	\
-		    level) {						\
-			u64 ttl = level & 3;				\
+	if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) &&		\
+	    level) {							\
+		u64 ttl = level & 3;					\
 									\
-			switch (PAGE_SIZE) {				\
-			case SZ_4K:					\
-				ttl |= TLBI_TTL_TG_4K << 2;		\
-				break;					\
-			case SZ_16K:					\
-				ttl |= TLBI_TTL_TG_16K << 2;		\
-				break;					\
-			case SZ_64K:					\
-				ttl |= TLBI_TTL_TG_64K << 2;		\
-				break;					\
-			}						\
-									\
-			arg &= ~TLBI_TTL_MASK;				\
-			arg |= FIELD_PREP(TLBI_TTL_MASK, ttl);		\
+		switch (PAGE_SIZE) {					\
+		case SZ_4K:						\
+			ttl |= TLBI_TTL_TG_4K << 2;			\
+			break;						\
+		case SZ_16K:						\
+			ttl |= TLBI_TTL_TG_16K << 2;			\
+			break;						\
+		case SZ_64K:						\
+			ttl |= TLBI_TTL_TG_64K << 2;			\
+			break;						\
 		}							\
 									\
-		__tlbi(op, arg);					\
-	} while(0)
+		arg &= ~TLBI_TTL_MASK;					\
+		arg |= FIELD_PREP(TLBI_TTL_MASK, ttl);			\
+	}								\
+									\
+	__tlbi(op, arg);						\
+} while(0)
 
 #define __tlbi_user_level(op, arg, level) do {				\
 	if (arm64_kernel_unmapped_at_el0())				\

From c6c83d757a13a5df51428a6fe133c9193810507b Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Tue, 7 Jul 2020 19:53:13 +0530
Subject: [PATCH 283/502] arm64/cpufeature: Validate feature bits spacing in
 arm64_ftr_regs[]

The arm64_ftr_bits entries for a register in arm64_ftr_regs[] are in
descending order of their shift values. Validate that these feature bits
are defined correctly and do not overlap with each other. This check
protects against any inadvertent erroneous changes to the register
definitions.

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Link: https://lore.kernel.org/r/1594131793-9498-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/cpufeature.c | 47 +++++++++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 19146bd338b4..d9b51cb9cb8c 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -712,11 +712,52 @@ static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new,
 
 static void __init sort_ftr_regs(void)
 {
-	int i;
+	unsigned int i;
 
-	/* Check that the array is sorted so that we can do the binary search */
-	for (i = 1; i < ARRAY_SIZE(arm64_ftr_regs); i++)
+	for (i = 0; i < ARRAY_SIZE(arm64_ftr_regs); i++) {
+		const struct arm64_ftr_reg *ftr_reg = arm64_ftr_regs[i].reg;
+		const struct arm64_ftr_bits *ftr_bits = ftr_reg->ftr_bits;
+		unsigned int j = 0;
+
+		/*
+		 * Features here must be sorted in descending order with respect
+		 * to their shift values and should not overlap with each other.
+		 */
+		for (; ftr_bits->width != 0; ftr_bits++, j++) {
+			unsigned int width = ftr_reg->ftr_bits[j].width;
+			unsigned int shift = ftr_reg->ftr_bits[j].shift;
+			unsigned int prev_shift;
+
+			WARN((shift  + width) > 64,
+				"%s has invalid feature at shift %d\n",
+				ftr_reg->name, shift);
+
+			/*
+			 * Skip the first feature. There is nothing to
+			 * compare against for now.
+			 */
+			if (j == 0)
+				continue;
+
+			prev_shift = ftr_reg->ftr_bits[j - 1].shift;
+			WARN((shift + width) > prev_shift,
+				"%s has feature overlap at shift %d\n",
+				ftr_reg->name, shift);
+		}
+
+		/*
+		 * Skip the first register. There is nothing to
+		 * compare against for now.
+		 */
+		if (i == 0)
+			continue;
+		/*
+		 * Registers here must be sorted in ascending order with respect
+		 * to sys_id for subsequent binary search in get_arm64_ftr_reg()
+		 * to work correctly.
+		 */
 		BUG_ON(arm64_ftr_regs[i].sys_id < arm64_ftr_regs[i - 1].sys_id);
+	}
 }
 
 /*

From 9dedd56301564acdbb1dd37cf09250a4c7b783c9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 7 Jul 2020 16:36:20 +0300
Subject: [PATCH 284/502] io_uring: partially inline io_iopoll_getevents()

io_iopoll_reap_events() doesn't care about the returned value of
io_iopoll_getevents() and does the same checks for list emptiness
and need_resched(). Just use io_do_iopoll().

io_sq_thread() doesn't check the return value either. It also passes
min=0, so there will never be a second iteration inside
io_iopoll_getevents(). Inline it there too.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6e3169834bf7..104af675f6fb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2064,7 +2064,7 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 	while (!list_empty(&ctx->poll_list)) {
 		unsigned int nr_events = 0;
 
-		io_iopoll_getevents(ctx, &nr_events, 1);
+		io_do_iopoll(ctx, &nr_events, 1);
 
 		/*
 		 * Ensure we allow local-to-the-cpu processing to take place,
@@ -6318,8 +6318,8 @@ static int io_sq_thread(void *data)
 			unsigned nr_events = 0;
 
 			mutex_lock(&ctx->uring_lock);
-			if (!list_empty(&ctx->poll_list))
-				io_iopoll_getevents(ctx, &nr_events, 0);
+			if (!list_empty(&ctx->poll_list) && !need_resched())
+				io_do_iopoll(ctx, &nr_events, 0);
 			else
 				timeout = jiffies + ctx->sq_thread_idle;
 			mutex_unlock(&ctx->uring_lock);

From 7668b92a69b8201e2dd16a47a08efb93e909f419 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 7 Jul 2020 16:36:21 +0300
Subject: [PATCH 285/502] io_uring: remove nr_events arg from iopoll_check()

Nobody checks io_iopoll_check()'s output parameter @nr_events.
Remove the parameter and declare it further down the stack.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 104af675f6fb..38bf42320f56 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2080,9 +2080,9 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 	mutex_unlock(&ctx->uring_lock);
 }
 
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
-			   long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
 {
+	unsigned int nr_events = 0;
 	int iters = 0, ret = 0;
 
 	/*
@@ -2116,11 +2116,11 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
 			mutex_lock(&ctx->uring_lock);
 		}
 
-		ret = io_iopoll_getevents(ctx, nr_events, min);
+		ret = io_iopoll_getevents(ctx, &nr_events, min);
 		if (ret <= 0)
 			break;
 		ret = 0;
-	} while (min && !*nr_events && !need_resched());
+	} while (min && !nr_events && !need_resched());
 
 	mutex_unlock(&ctx->uring_lock);
 	return ret;
@@ -7977,8 +7977,6 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 			goto out;
 	}
 	if (flags & IORING_ENTER_GETEVENTS) {
-		unsigned nr_events = 0;
-
 		min_complete = min(min_complete, ctx->cq_entries);
 
 		/*
@@ -7989,7 +7987,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
 		 */
 		if (ctx->flags & IORING_SETUP_IOPOLL &&
 		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
-			ret = io_iopoll_check(ctx, &nr_events, min_complete);
+			ret = io_iopoll_check(ctx, min_complete);
 		} else {
 			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
 		}

From b2edc0a77fac19bbdef63cedb2ea34aec1a9a499 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Tue, 7 Jul 2020 16:36:22 +0300
Subject: [PATCH 286/502] io_uring: don't burn CPU for iopoll on exit

First of all, don't spin in io_ring_ctx_wait_and_kill() on iopoll.
Requests won't complete any faster because of that; it only lengthens
io_uring_release().

The same goes for the offloaded cleanup in io_ring_exit_work() -- it
already has a waiting loop, so don't do blocking active spinning.

For that, pass min=0 into io_iopoll_[try_]reap_events(), so it won't
actively spin. If io_do_iopoll() can't complete a request there, leave
the function so that io_ring_exit_work() can sleep.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 38bf42320f56..4c9a494c9f9f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2055,7 +2055,7 @@ static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
  * We can't just wait for polled events to come to us, we have to actively
  * find and complete them.
  */
-static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
+static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
 {
 	if (!(ctx->flags & IORING_SETUP_IOPOLL))
 		return;
@@ -2064,8 +2064,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
 	while (!list_empty(&ctx->poll_list)) {
 		unsigned int nr_events = 0;
 
-		io_do_iopoll(ctx, &nr_events, 1);
+		io_do_iopoll(ctx, &nr_events, 0);
 
+		/* let it sleep and repeat later if can't complete a request */
+		if (nr_events == 0)
+			break;
 		/*
 		 * Ensure we allow local-to-the-cpu processing to take place,
 		 * in this case we need to ensure that we reap all events.
@@ -7648,7 +7651,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 		ctx->sqo_mm = NULL;
 	}
 
-	io_iopoll_reap_events(ctx);
 	io_sqe_buffer_unregister(ctx);
 	io_sqe_files_unregister(ctx);
 	io_eventfd_unregister(ctx);
@@ -7715,11 +7717,8 @@ static int io_remove_personalities(int id, void *p, void *data)
 
 static void io_ring_exit_work(struct work_struct *work)
 {
-	struct io_ring_ctx *ctx;
-
-	ctx = container_of(work, struct io_ring_ctx, exit_work);
-	if (ctx->rings)
-		io_cqring_overflow_flush(ctx, true);
+	struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx,
+					       exit_work);
 
 	/*
 	 * If we're doing polled IO and end up having requests being
@@ -7727,11 +7726,11 @@ static void io_ring_exit_work(struct work_struct *work)
 	 * we're waiting for refs to drop. We need to reap these manually,
 	 * as nobody else will be looking for them.
 	 */
-	while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)) {
-		io_iopoll_reap_events(ctx);
+	do {
 		if (ctx->rings)
 			io_cqring_overflow_flush(ctx, true);
-	}
+		io_iopoll_try_reap_events(ctx);
+	} while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20));
 	io_ring_ctx_free(ctx);
 }
 
@@ -7747,10 +7746,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 	if (ctx->io_wq)
 		io_wq_cancel_all(ctx->io_wq);
 
-	io_iopoll_reap_events(ctx);
 	/* if we failed setting up the ctx, we might not have any rings */
 	if (ctx->rings)
 		io_cqring_overflow_flush(ctx, true);
+	io_iopoll_try_reap_events(ctx);
 	idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
 	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
 	queue_work(system_wq, &ctx->exit_work);

From bd657aa3dd8514e62486ce7f90b5e484c18d684d Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:07 -0700
Subject: [PATCH 287/502] x86/cpufeatures: Add Architectural LBRs feature bit

CPUID.(EAX=07H, ECX=0):EDX[19] indicates whether an Intel CPU supports
Architectural LBRs.

The "X86_FEATURE_..., word 18" is already mirrored from CPUID
"0x00000007:0 (EDX)". Add X86_FEATURE_ARCH_LBR under the "word 18"
section.

The feature will appear as "arch_lbr" in /proc/cpuinfo.

The Architectural Last Branch Records (LBR) feature enables recording
of software path history by logging taken branches and other control
flows. The feature will be supported in the perf_events subsystem.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/1593780569-62993-2-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 02dabc9e77b0..72ba4c59ad05 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -366,6 +366,7 @@
 #define X86_FEATURE_MD_CLEAR		(18*32+10) /* VERW clears CPU buffers */
 #define X86_FEATURE_TSX_FORCE_ABORT	(18*32+13) /* "" TSX_FORCE_ABORT */
 #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
+#define X86_FEATURE_ARCH_LBR		(18*32+19) /* Intel ARCH LBR */
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_FLUSH_L1D		(18*32+28) /* Flush L1D cache */

From 9f354a726cb1d4eb00a0784a27eaa0a3283cff71 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:08 -0700
Subject: [PATCH 288/502] perf/x86/intel/lbr: Add a function pointer for LBR
 reset

The method to reset Architectural LBRs is different from previous
model-specific LBR. Perf has to implement a different function.

A function pointer is introduced for LBR reset. The enum of
LBR_FORMAT_* is also moved to perf_event.h. Perf should initialize the
corresponding functions at boot time, and avoid checking lbr_format at
run time.

The current 64-bit LBR reset function is set as default.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-3-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/core.c |  7 +++++++
 arch/x86/events/intel/lbr.c  | 20 +++-----------------
 arch/x86/events/perf_event.h | 17 +++++++++++++++++
 3 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 582ddff9a359..fe49e99e4fbf 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3978,6 +3978,8 @@ static __initconst const struct x86_pmu core_pmu = {
 	.cpu_dead		= intel_pmu_cpu_dead,
 
 	.check_period		= intel_pmu_check_period,
+
+	.lbr_reset		= intel_pmu_lbr_reset_64,
 };
 
 static __initconst const struct x86_pmu intel_pmu = {
@@ -4023,6 +4025,8 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.check_period		= intel_pmu_check_period,
 
 	.aux_output_match	= intel_pmu_aux_output_match,
+
+	.lbr_reset		= intel_pmu_lbr_reset_64,
 };
 
 static __init void intel_clovertown_quirk(void)
@@ -4649,6 +4653,9 @@ __init int intel_pmu_init(void)
 		x86_pmu.intel_cap.capabilities = capabilities;
 	}
 
+	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+		x86_pmu.lbr_reset = intel_pmu_lbr_reset_32;
+
 	intel_ds_init();
 
 	x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index d03de7539957..7af27a766002 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -8,17 +8,6 @@
 
 #include "../perf_event.h"
 
-enum {
-	LBR_FORMAT_32		= 0x00,
-	LBR_FORMAT_LIP		= 0x01,
-	LBR_FORMAT_EIP		= 0x02,
-	LBR_FORMAT_EIP_FLAGS	= 0x03,
-	LBR_FORMAT_EIP_FLAGS2	= 0x04,
-	LBR_FORMAT_INFO		= 0x05,
-	LBR_FORMAT_TIME		= 0x06,
-	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_TIME,
-};
-
 static const enum {
 	LBR_EIP_FLAGS		= 1,
 	LBR_TSX			= 2,
@@ -194,7 +183,7 @@ static void __intel_pmu_lbr_disable(void)
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
-static void intel_pmu_lbr_reset_32(void)
+void intel_pmu_lbr_reset_32(void)
 {
 	int i;
 
@@ -202,7 +191,7 @@ static void intel_pmu_lbr_reset_32(void)
 		wrmsrl(x86_pmu.lbr_from + i, 0);
 }
 
-static void intel_pmu_lbr_reset_64(void)
+void intel_pmu_lbr_reset_64(void)
 {
 	int i;
 
@@ -221,10 +210,7 @@ void intel_pmu_lbr_reset(void)
 	if (!x86_pmu.lbr_nr)
 		return;
 
-	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
-		intel_pmu_lbr_reset_32();
-	else
-		intel_pmu_lbr_reset_64();
+	x86_pmu.lbr_reset();
 
 	cpuc->last_task_ctx = NULL;
 	cpuc->last_log_id = 0;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 81475963df99..5c1ad4360715 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -179,6 +179,17 @@ struct intel_excl_cntrs {
 struct x86_perf_task_context;
 #define MAX_LBR_ENTRIES		32
 
+enum {
+	LBR_FORMAT_32		= 0x00,
+	LBR_FORMAT_LIP		= 0x01,
+	LBR_FORMAT_EIP		= 0x02,
+	LBR_FORMAT_EIP_FLAGS	= 0x03,
+	LBR_FORMAT_EIP_FLAGS2	= 0x04,
+	LBR_FORMAT_INFO		= 0x05,
+	LBR_FORMAT_TIME		= 0x06,
+	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_TIME,
+};
+
 enum {
 	X86_PERF_KFREE_SHARED = 0,
 	X86_PERF_KFREE_EXCL   = 1,
@@ -682,6 +693,8 @@ struct x86_pmu {
 	bool		lbr_double_abort;	   /* duplicated lbr aborts */
 	bool		lbr_pt_coexist;		   /* (LBR|BTS) may coexist with PT */
 
+	void		(*lbr_reset)(void);
+
 	/*
 	 * Intel PT/LBR/BTS are exclusive
 	 */
@@ -1058,6 +1071,10 @@ u64 lbr_from_signext_quirk_wr(u64 val);
 
 void intel_pmu_lbr_reset(void);
 
+void intel_pmu_lbr_reset_32(void);
+
+void intel_pmu_lbr_reset_64(void);
+
 void intel_pmu_lbr_add(struct perf_event *event);
 
 void intel_pmu_lbr_del(struct perf_event *event);

From c301b1d80ed5b806834fe0f739f028f65fb4fb16 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:09 -0700
Subject: [PATCH 289/502] perf/x86/intel/lbr: Add a function pointer for LBR
 read

The method to read Architectural LBRs is different from previous
model-specific LBR. Perf has to implement a different function.

A function pointer for LBR read is introduced. Perf should initialize
the corresponding function at boot time, and avoid checking lbr_format
at run time.

The current 64-bit LBR read function is set as default.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-4-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/core.c | 6 +++++-
 arch/x86/events/intel/lbr.c  | 9 +++------
 arch/x86/events/perf_event.h | 5 +++++
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index fe49e99e4fbf..6414b4799ce7 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3980,6 +3980,7 @@ static __initconst const struct x86_pmu core_pmu = {
 	.check_period		= intel_pmu_check_period,
 
 	.lbr_reset		= intel_pmu_lbr_reset_64,
+	.lbr_read		= intel_pmu_lbr_read_64,
 };
 
 static __initconst const struct x86_pmu intel_pmu = {
@@ -4027,6 +4028,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.aux_output_match	= intel_pmu_aux_output_match,
 
 	.lbr_reset		= intel_pmu_lbr_reset_64,
+	.lbr_read		= intel_pmu_lbr_read_64,
 };
 
 static __init void intel_clovertown_quirk(void)
@@ -4653,8 +4655,10 @@ __init int intel_pmu_init(void)
 		x86_pmu.intel_cap.capabilities = capabilities;
 	}
 
-	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) {
 		x86_pmu.lbr_reset = intel_pmu_lbr_reset_32;
+		x86_pmu.lbr_read = intel_pmu_lbr_read_32;
+	}
 
 	intel_ds_init();
 
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 7af27a766002..b8943f45ca69 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -562,7 +562,7 @@ void intel_pmu_lbr_disable_all(void)
 		__intel_pmu_lbr_disable();
 }
 
-static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
 	u64 tos = intel_pmu_lbr_tos();
@@ -599,7 +599,7 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
  * is the same as the linear address, allowing us to merge the LIP and EIP
  * LBR formats.
  */
-static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 {
 	bool need_info = false, call_stack = false;
 	unsigned long mask = x86_pmu.lbr_nr - 1;
@@ -704,10 +704,7 @@ void intel_pmu_lbr_read(void)
 	    cpuc->lbr_users == cpuc->lbr_pebs_users)
 		return;
 
-	if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
-		intel_pmu_lbr_read_32(cpuc);
-	else
-		intel_pmu_lbr_read_64(cpuc);
+	x86_pmu.lbr_read(cpuc);
 
 	intel_pmu_lbr_filter(cpuc);
 }
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 5c1ad4360715..312d27f269e6 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -694,6 +694,7 @@ struct x86_pmu {
 	bool		lbr_pt_coexist;		   /* (LBR|BTS) may coexist with PT */
 
 	void		(*lbr_reset)(void);
+	void		(*lbr_read)(struct cpu_hw_events *cpuc);
 
 	/*
 	 * Intel PT/LBR/BTS are exclusive
@@ -1085,6 +1086,10 @@ void intel_pmu_lbr_disable_all(void);
 
 void intel_pmu_lbr_read(void);
 
+void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc);
+
+void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc);
+
 void intel_pmu_lbr_init_core(void);
 
 void intel_pmu_lbr_init_nhm(void);

From 799571bf38fc2b4b744fa448184b5915739b10fd Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:10 -0700
Subject: [PATCH 290/502] perf/x86/intel/lbr: Add the function pointers for LBR
 save and restore

The MSRs of Architectural LBR are different from previous model-specific
LBR. Perf has to implement different functions to save and restore them.

The function pointers for LBR save and restore are introduced. Perf
should initialize the corresponding functions at boot time.

The generic optimizations, e.g. skipping the LBR restore if no one else
touched the registers, still apply to Architectural LBRs. The related
code is not moved into the model-specific functions.

Current model-specific LBR functions are set as default.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-5-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/core.c |  4 ++
 arch/x86/events/intel/lbr.c  | 83 ++++++++++++++++++++++--------------
 arch/x86/events/perf_event.h |  6 +++
 3 files changed, 61 insertions(+), 32 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 6414b4799ce7..50cb3c69d6a4 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3981,6 +3981,8 @@ static __initconst const struct x86_pmu core_pmu = {
 
 	.lbr_reset		= intel_pmu_lbr_reset_64,
 	.lbr_read		= intel_pmu_lbr_read_64,
+	.lbr_save		= intel_pmu_lbr_save,
+	.lbr_restore		= intel_pmu_lbr_restore,
 };
 
 static __initconst const struct x86_pmu intel_pmu = {
@@ -4029,6 +4031,8 @@ static __initconst const struct x86_pmu intel_pmu = {
 
 	.lbr_reset		= intel_pmu_lbr_reset_64,
 	.lbr_read		= intel_pmu_lbr_read_64,
+	.lbr_save		= intel_pmu_lbr_save,
+	.lbr_restore		= intel_pmu_lbr_restore,
 };
 
 static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index b8943f45ca69..b2b8dc973057 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -323,11 +323,41 @@ static inline u64 rdlbr_to(unsigned int idx)
 	return val;
 }
 
+void intel_pmu_lbr_restore(void *ctx)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx = ctx;
+	int i;
+	unsigned lbr_idx, mask;
+	u64 tos = task_ctx->tos;
+
+	mask = x86_pmu.lbr_nr - 1;
+	for (i = 0; i < task_ctx->valid_lbrs; i++) {
+		lbr_idx = (tos - i) & mask;
+		wrlbr_from(lbr_idx, task_ctx->lbr_from[i]);
+		wrlbr_to  (lbr_idx, task_ctx->lbr_to[i]);
+
+		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+	}
+
+	for (; i < x86_pmu.lbr_nr; i++) {
+		lbr_idx = (tos - i) & mask;
+		wrlbr_from(lbr_idx, 0);
+		wrlbr_to(lbr_idx, 0);
+		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0);
+	}
+
+	wrmsrl(x86_pmu.lbr_tos, tos);
+
+	if (cpuc->lbr_select)
+		wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
+}
+
 static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	int i;
-	unsigned lbr_idx, mask;
 	u64 tos;
 
 	if (task_ctx->lbr_callstack_users == 0 ||
@@ -349,43 +379,19 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 		return;
 	}
 
-	mask = x86_pmu.lbr_nr - 1;
-	for (i = 0; i < task_ctx->valid_lbrs; i++) {
-		lbr_idx = (tos - i) & mask;
-		wrlbr_from(lbr_idx, task_ctx->lbr_from[i]);
-		wrlbr_to  (lbr_idx, task_ctx->lbr_to[i]);
+	x86_pmu.lbr_restore(task_ctx);
 
-		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
-	}
-
-	for (; i < x86_pmu.lbr_nr; i++) {
-		lbr_idx = (tos - i) & mask;
-		wrlbr_from(lbr_idx, 0);
-		wrlbr_to(lbr_idx, 0);
-		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0);
-	}
-
-	wrmsrl(x86_pmu.lbr_tos, tos);
 	task_ctx->lbr_stack_state = LBR_NONE;
-
-	if (cpuc->lbr_select)
-		wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
-static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+void intel_pmu_lbr_save(void *ctx)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx = ctx;
 	unsigned lbr_idx, mask;
 	u64 tos, from;
 	int i;
 
-	if (task_ctx->lbr_callstack_users == 0) {
-		task_ctx->lbr_stack_state = LBR_NONE;
-		return;
-	}
-
 	mask = x86_pmu.lbr_nr - 1;
 	tos = intel_pmu_lbr_tos();
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
@@ -400,13 +406,26 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
 	}
 	task_ctx->valid_lbrs = i;
 	task_ctx->tos = tos;
+
+	if (cpuc->lbr_select)
+		rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	if (task_ctx->lbr_callstack_users == 0) {
+		task_ctx->lbr_stack_state = LBR_NONE;
+		return;
+	}
+
+	x86_pmu.lbr_save(task_ctx);
+
 	task_ctx->lbr_stack_state = LBR_VALID;
 
 	cpuc->last_task_ctx = task_ctx;
 	cpuc->last_log_id = ++task_ctx->log_id;
-
-	if (cpuc->lbr_select)
-		rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
 void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 312d27f269e6..6d11813582c0 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -695,6 +695,8 @@ struct x86_pmu {
 
 	void		(*lbr_reset)(void);
 	void		(*lbr_read)(struct cpu_hw_events *cpuc);
+	void		(*lbr_save)(void *ctx);
+	void		(*lbr_restore)(void *ctx);
 
 	/*
 	 * Intel PT/LBR/BTS are exclusive
@@ -1090,6 +1092,10 @@ void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc);
 
 void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc);
 
+void intel_pmu_lbr_save(void *ctx);
+
+void intel_pmu_lbr_restore(void *ctx);
+
 void intel_pmu_lbr_init_core(void);
 
 void intel_pmu_lbr_init_nhm(void);

From 530bfff6480307d210734222a54d56af7f908957 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:11 -0700
Subject: [PATCH 291/502] perf/x86/intel/lbr: Factor out a new struct for
 generic optimization

To reduce the overhead of a context switch with LBR enabled, some
generic optimizations were introduced, e.g. skipping the LBR restore if
no one else touched the registers. The generic optimizations can also be
used by Architecture LBR later. Currently, the fields for the generic
optimizations are part of structure x86_perf_task_context, which will be
deprecated by Architecture LBR. A new structure should be introduced
for the common fields of generic optimization, which can be shared
between Architecture LBR and model-specific LBR.

Both 'valid_lbrs' and 'tos' are also used by the generic optimizations,
but they are not moved into the new structure, because Architecture LBR
is stack-like. The 'valid_lbrs' which records the index of the valid LBR
is not required anymore. The TOS MSR will be removed.

LBR registers may be cleared in a deep C-state. If so, the generic
optimizations should not be applied, and perf has to unconditionally
restore the LBR registers. A generic function is required to detect a
reset caused by a deep C-state, so lbr_is_reset_in_cstate() is introduced.
Currently, for the model-specific LBR, the TOS MSR is used to detect the
reset. There will be another method introduced for Architecture LBR
later.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-6-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/lbr.c  | 38 ++++++++++++++++++++----------------
 arch/x86/events/perf_event.h | 10 +++++++---
 2 files changed, 28 insertions(+), 20 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index b2b8dc973057..bba9939635b6 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -355,33 +355,37 @@ void intel_pmu_lbr_restore(void *ctx)
 		wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
+static __always_inline bool
+lbr_is_reset_in_cstate(struct x86_perf_task_context *task_ctx)
+{
+	return !rdlbr_from(task_ctx->tos);
+}
+
 static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	u64 tos;
 
-	if (task_ctx->lbr_callstack_users == 0 ||
-	    task_ctx->lbr_stack_state == LBR_NONE) {
+	if (task_ctx->opt.lbr_callstack_users == 0 ||
+	    task_ctx->opt.lbr_stack_state == LBR_NONE) {
 		intel_pmu_lbr_reset();
 		return;
 	}
 
-	tos = task_ctx->tos;
 	/*
 	 * Does not restore the LBR registers, if
 	 * - No one else touched them, and
-	 * - Did not enter C6
+	 * - Was not cleared in Cstate
 	 */
 	if ((task_ctx == cpuc->last_task_ctx) &&
-	    (task_ctx->log_id == cpuc->last_log_id) &&
-	    rdlbr_from(tos)) {
-		task_ctx->lbr_stack_state = LBR_NONE;
+	    (task_ctx->opt.log_id == cpuc->last_log_id) &&
+	    !lbr_is_reset_in_cstate(task_ctx)) {
+		task_ctx->opt.lbr_stack_state = LBR_NONE;
 		return;
 	}
 
 	x86_pmu.lbr_restore(task_ctx);
 
-	task_ctx->lbr_stack_state = LBR_NONE;
+	task_ctx->opt.lbr_stack_state = LBR_NONE;
 }
 
 void intel_pmu_lbr_save(void *ctx)
@@ -415,17 +419,17 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
-	if (task_ctx->lbr_callstack_users == 0) {
-		task_ctx->lbr_stack_state = LBR_NONE;
+	if (task_ctx->opt.lbr_callstack_users == 0) {
+		task_ctx->opt.lbr_stack_state = LBR_NONE;
 		return;
 	}
 
 	x86_pmu.lbr_save(task_ctx);
 
-	task_ctx->lbr_stack_state = LBR_VALID;
+	task_ctx->opt.lbr_stack_state = LBR_VALID;
 
 	cpuc->last_task_ctx = task_ctx;
-	cpuc->last_log_id = ++task_ctx->log_id;
+	cpuc->last_log_id = ++task_ctx->opt.log_id;
 }
 
 void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
@@ -447,8 +451,8 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
 	if (!prev_ctx_data || !next_ctx_data)
 		return;
 
-	swap(prev_ctx_data->lbr_callstack_users,
-	     next_ctx_data->lbr_callstack_users);
+	swap(prev_ctx_data->opt.lbr_callstack_users,
+	     next_ctx_data->opt.lbr_callstack_users);
 }
 
 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
@@ -503,7 +507,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
 
 	if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
 		task_ctx = event->ctx->task_ctx_data;
-		task_ctx->lbr_callstack_users++;
+		task_ctx->opt.lbr_callstack_users++;
 	}
 
 	/*
@@ -543,7 +547,7 @@ void intel_pmu_lbr_del(struct perf_event *event)
 	if (branch_user_callstack(cpuc->br_sel) &&
 	    event->ctx->task_ctx_data) {
 		task_ctx = event->ctx->task_ctx_data;
-		task_ctx->lbr_callstack_users--;
+		task_ctx->opt.lbr_callstack_users--;
 	}
 
 	if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 6d11813582c0..96d73cd8b7a1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -736,6 +736,12 @@ struct x86_pmu {
 	int (*aux_output_match) (struct perf_event *event);
 };
 
+struct x86_perf_task_context_opt {
+	int lbr_callstack_users;
+	int lbr_stack_state;
+	int log_id;
+};
+
 struct x86_perf_task_context {
 	u64 lbr_from[MAX_LBR_ENTRIES];
 	u64 lbr_to[MAX_LBR_ENTRIES];
@@ -743,9 +749,7 @@ struct x86_perf_task_context {
 	u64 lbr_sel;
 	int tos;
 	int valid_lbrs;
-	int lbr_callstack_users;
-	int lbr_stack_state;
-	int log_id;
+	struct x86_perf_task_context_opt opt;
 };
 
 #define x86_add_quirk(func_)						\

From f42be8651a7a9d5cb165e5d176fc0b09621b4f4d Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:12 -0700
Subject: [PATCH 292/502] perf/x86/intel/lbr: Use dynamic data structure for
 task_ctx

The type of task_ctx is hardcoded as struct x86_perf_task_context,
which doesn't apply for Architecture LBR. For example, Architecture LBR
doesn't have the TOS MSR. The number of LBR entries is variable. A new
struct will be introduced for Architecture LBR. Perf has to determine
the type of task_ctx at run time.

The type of the task_ctx pointer is changed to 'void *'; the actual
structure type behind it will be determined at run time.

The generic LBR optimization can be shared between Architecture LBR and
model-specific LBR. Both need to access the structure for the generic
LBR optimization. A helper task_context_opt() is introduced to retrieve
the pointer of the structure at run time.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-7-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/lbr.c  | 59 ++++++++++++++++--------------------
 arch/x86/events/perf_event.h |  7 ++++-
 2 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index bba9939635b6..e62baa996474 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -355,18 +355,17 @@ void intel_pmu_lbr_restore(void *ctx)
 		wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
-static __always_inline bool
-lbr_is_reset_in_cstate(struct x86_perf_task_context *task_ctx)
+static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
 {
-	return !rdlbr_from(task_ctx->tos);
+	return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos);
 }
 
-static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+static void __intel_pmu_lbr_restore(void *ctx)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
-	if (task_ctx->opt.lbr_callstack_users == 0 ||
-	    task_ctx->opt.lbr_stack_state == LBR_NONE) {
+	if (task_context_opt(ctx)->lbr_callstack_users == 0 ||
+	    task_context_opt(ctx)->lbr_stack_state == LBR_NONE) {
 		intel_pmu_lbr_reset();
 		return;
 	}
@@ -376,16 +375,16 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
 	 * - No one else touched them, and
 	 * - Was not cleared in Cstate
 	 */
-	if ((task_ctx == cpuc->last_task_ctx) &&
-	    (task_ctx->opt.log_id == cpuc->last_log_id) &&
-	    !lbr_is_reset_in_cstate(task_ctx)) {
-		task_ctx->opt.lbr_stack_state = LBR_NONE;
+	if ((ctx == cpuc->last_task_ctx) &&
+	    (task_context_opt(ctx)->log_id == cpuc->last_log_id) &&
+	    !lbr_is_reset_in_cstate(ctx)) {
+		task_context_opt(ctx)->lbr_stack_state = LBR_NONE;
 		return;
 	}
 
-	x86_pmu.lbr_restore(task_ctx);
+	x86_pmu.lbr_restore(ctx);
 
-	task_ctx->opt.lbr_stack_state = LBR_NONE;
+	task_context_opt(ctx)->lbr_stack_state = LBR_NONE;
 }
 
 void intel_pmu_lbr_save(void *ctx)
@@ -415,27 +414,27 @@ void intel_pmu_lbr_save(void *ctx)
 		rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
-static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+static void __intel_pmu_lbr_save(void *ctx)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
-	if (task_ctx->opt.lbr_callstack_users == 0) {
-		task_ctx->opt.lbr_stack_state = LBR_NONE;
+	if (task_context_opt(ctx)->lbr_callstack_users == 0) {
+		task_context_opt(ctx)->lbr_stack_state = LBR_NONE;
 		return;
 	}
 
-	x86_pmu.lbr_save(task_ctx);
+	x86_pmu.lbr_save(ctx);
 
-	task_ctx->opt.lbr_stack_state = LBR_VALID;
+	task_context_opt(ctx)->lbr_stack_state = LBR_VALID;
 
-	cpuc->last_task_ctx = task_ctx;
-	cpuc->last_log_id = ++task_ctx->opt.log_id;
+	cpuc->last_task_ctx = ctx;
+	cpuc->last_log_id = ++task_context_opt(ctx)->log_id;
 }
 
 void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
 				 struct perf_event_context *next)
 {
-	struct x86_perf_task_context *prev_ctx_data, *next_ctx_data;
+	void *prev_ctx_data, *next_ctx_data;
 
 	swap(prev->task_ctx_data, next->task_ctx_data);
 
@@ -451,14 +450,14 @@ void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev,
 	if (!prev_ctx_data || !next_ctx_data)
 		return;
 
-	swap(prev_ctx_data->opt.lbr_callstack_users,
-	     next_ctx_data->opt.lbr_callstack_users);
+	swap(task_context_opt(prev_ctx_data)->lbr_callstack_users,
+	     task_context_opt(next_ctx_data)->lbr_callstack_users);
 }
 
 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	struct x86_perf_task_context *task_ctx;
+	void *task_ctx;
 
 	if (!cpuc->lbr_users)
 		return;
@@ -495,7 +494,6 @@ static inline bool branch_user_callstack(unsigned br_sel)
 void intel_pmu_lbr_add(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	struct x86_perf_task_context *task_ctx;
 
 	if (!x86_pmu.lbr_nr)
 		return;
@@ -505,10 +503,8 @@ void intel_pmu_lbr_add(struct perf_event *event)
 
 	cpuc->br_sel = event->hw.branch_reg.reg;
 
-	if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) {
-		task_ctx = event->ctx->task_ctx_data;
-		task_ctx->opt.lbr_callstack_users++;
-	}
+	if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data)
+		task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users++;
 
 	/*
 	 * Request pmu::sched_task() callback, which will fire inside the
@@ -539,16 +535,13 @@ void intel_pmu_lbr_add(struct perf_event *event)
 void intel_pmu_lbr_del(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	struct x86_perf_task_context *task_ctx;
 
 	if (!x86_pmu.lbr_nr)
 		return;
 
 	if (branch_user_callstack(cpuc->br_sel) &&
-	    event->ctx->task_ctx_data) {
-		task_ctx = event->ctx->task_ctx_data;
-		task_ctx->opt.lbr_callstack_users--;
-	}
+	    event->ctx->task_ctx_data)
+		task_context_opt(event->ctx->task_ctx_data)->lbr_callstack_users--;
 
 	if (event->hw.flags & PERF_X86_EVENT_LBR_SELECT)
 		cpuc->lbr_select = 0;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 96d73cd8b7a1..7dbf1480b0a2 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -247,7 +247,7 @@ struct cpu_hw_events {
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
 	struct er_account		*lbr_sel;
 	u64				br_sel;
-	struct x86_perf_task_context	*last_task_ctx;
+	void				*last_task_ctx;
 	int				last_log_id;
 	int				lbr_select;
 
@@ -800,6 +800,11 @@ static struct perf_pmu_events_ht_attr event_attr_##v = {		\
 struct pmu *x86_get_pmu(void);
 extern struct x86_pmu x86_pmu __read_mostly;
 
+static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx)
+{
+	return &((struct x86_perf_task_context *)ctx)->opt;
+}
+
 static inline bool x86_pmu_has_lbr_callstack(void)
 {
 	return  x86_pmu.lbr_sel_map &&

From d6a162a41bfd2ff9ea4cbb338d3df6a3f9b7e89f Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:13 -0700
Subject: [PATCH 293/502] x86/msr-index: Add bunch of MSRs for Arch LBR

Add Arch LBR related MSRs and the new LBR INFO bits in MSR-index.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-8-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/include/asm/msr-index.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index e8370e64a155..bdc07fc6e517 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -158,7 +158,23 @@
 #define LBR_INFO_MISPRED		BIT_ULL(63)
 #define LBR_INFO_IN_TX			BIT_ULL(62)
 #define LBR_INFO_ABORT			BIT_ULL(61)
+#define LBR_INFO_CYC_CNT_VALID		BIT_ULL(60)
 #define LBR_INFO_CYCLES			0xffff
+#define LBR_INFO_BR_TYPE_OFFSET		56
+#define LBR_INFO_BR_TYPE		(0xfull << LBR_INFO_BR_TYPE_OFFSET)
+
+#define MSR_ARCH_LBR_CTL		0x000014ce
+#define ARCH_LBR_CTL_LBREN		BIT(0)
+#define ARCH_LBR_CTL_CPL_OFFSET		1
+#define ARCH_LBR_CTL_CPL		(0x3ull << ARCH_LBR_CTL_CPL_OFFSET)
+#define ARCH_LBR_CTL_STACK_OFFSET	3
+#define ARCH_LBR_CTL_STACK		(0x1ull << ARCH_LBR_CTL_STACK_OFFSET)
+#define ARCH_LBR_CTL_FILTER_OFFSET	16
+#define ARCH_LBR_CTL_FILTER		(0x7full << ARCH_LBR_CTL_FILTER_OFFSET)
+#define MSR_ARCH_LBR_DEPTH		0x000014cf
+#define MSR_ARCH_LBR_FROM_0		0x00001500
+#define MSR_ARCH_LBR_TO_0		0x00001600
+#define MSR_ARCH_LBR_INFO_0		0x00001200
 
 #define MSR_IA32_PEBS_ENABLE		0x000003f1
 #define MSR_PEBS_DATA_CFG		0x000003f2

From af6cf129706b2f79e12f97e62d977e7f653cdfd1 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:14 -0700
Subject: [PATCH 294/502] perf/x86: Expose CPUID enumeration bits for arch LBR

The LBR capabilities of Architecture LBR are retrieved from the CPUID
enumeration once at boot time. The capabilities have to be saved for
future usage.

Several new fields are added into structure x86_pmu to indicate the
capabilities. The fields will be used in the following patches.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-9-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/perf_event.h      | 13 ++++++++++
 arch/x86/include/asm/perf_event.h | 40 +++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 7dbf1480b0a2..cc8117764c08 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -693,6 +693,19 @@ struct x86_pmu {
 	bool		lbr_double_abort;	   /* duplicated lbr aborts */
 	bool		lbr_pt_coexist;		   /* (LBR|BTS) may coexist with PT */
 
+	/*
+	 * Intel Architectural LBR CPUID Enumeration
+	 */
+	unsigned int	lbr_depth_mask:8;
+	unsigned int	lbr_deep_c_reset:1;
+	unsigned int	lbr_lip:1;
+	unsigned int	lbr_cpl:1;
+	unsigned int	lbr_filter:1;
+	unsigned int	lbr_call_stack:1;
+	unsigned int	lbr_mispred:1;
+	unsigned int	lbr_timed_lbr:1;
+	unsigned int	lbr_br_type:1;
+
 	void		(*lbr_reset)(void);
 	void		(*lbr_read)(struct cpu_hw_events *cpuc);
 	void		(*lbr_save)(void *ctx);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 2df707311d17..9ffce7d31c4c 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -142,6 +142,46 @@ union cpuid10_edx {
 	unsigned int full;
 };
 
+/*
+ * Intel Architectural LBR CPUID detection/enumeration details:
+ */
+union cpuid28_eax {
+	struct {
+		/* Supported LBR depth values */
+		unsigned int	lbr_depth_mask:8;
+		unsigned int	reserved:22;
+		/* Deep C-state Reset */
+		unsigned int	lbr_deep_c_reset:1;
+		/* IP values contain LIP */
+		unsigned int	lbr_lip:1;
+	} split;
+	unsigned int		full;
+};
+
+union cpuid28_ebx {
+	struct {
+		/* CPL Filtering Supported */
+		unsigned int    lbr_cpl:1;
+		/* Branch Filtering Supported */
+		unsigned int    lbr_filter:1;
+		/* Call-stack Mode Supported */
+		unsigned int    lbr_call_stack:1;
+	} split;
+	unsigned int            full;
+};
+
+union cpuid28_ecx {
+	struct {
+		/* Mispredict Bit Supported */
+		unsigned int    lbr_mispred:1;
+		/* Timed LBRs Supported */
+		unsigned int    lbr_timed_lbr:1;
+		/* Branch Type Field Supported */
+		unsigned int    lbr_br_type:1;
+	} split;
+	unsigned int            full;
+};
+
 struct x86_pmu_capability {
 	int		version;
 	int		num_counters_gp;

From 49d8184f2036ff5b8d1eea3d61bac8b23420eca7 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:15 -0700
Subject: [PATCH 295/502] perf/x86/intel/lbr: Support LBR_CTL

An IA32_LBR_CTL MSR is introduced for Architecture LBR to enable and
configure the LBR registers. It replaces the previous LBR_SELECT.

All the related members in struct cpu_hw_events and struct x86_pmu
have to be renamed.

Some new macros are added to reflect the layout of LBR_CTL.

The mapping from PERF_SAMPLE_BRANCH_* to the corresponding bits in
LBR_CTL MSR is saved in lbr_ctl_map now, which is not a const value.
The value relies on the CPUID enumeration.

For the previous model-specific LBR, most of the bits in LBR_SELECT
operate in the suppressed mode. For the bits in LBR_CTL, the polarity is
inverted.

For the previous model-specific LBR format 5 (LBR_FORMAT_INFO), if the
NO_CYCLES and NO_FLAGS type are set, the flag LBR_NO_INFO will be set to
avoid the unnecessary LBR_INFO MSR read. Although Architecture LBR also
has a dedicated LBR_INFO MSR, perf doesn't need to check and set the
flag LBR_NO_INFO. For Architecture LBR, XSAVES instruction will be used
as the default way to read the LBR MSRs all together. The overhead which
the flag tries to avoid doesn't exist anymore. Dropping the flag can
save the extra check for the flag in the lbr_read() later, and make the
code cleaner.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-10-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/lbr.c  | 43 ++++++++++++++++++++++++++++++++++++
 arch/x86/events/perf_event.h | 15 ++++++++++---
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index e62baa996474..77425624752c 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -132,6 +132,44 @@ enum {
 	 X86_BR_IRQ		|\
 	 X86_BR_INT)
 
+/*
+ * Intel LBR_CTL bits
+ *
+ * Hardware branch filter for Arch LBR
+ */
+#define ARCH_LBR_KERNEL_BIT		1  /* capture at ring0 */
+#define ARCH_LBR_USER_BIT		2  /* capture at ring > 0 */
+#define ARCH_LBR_CALL_STACK_BIT		3  /* enable call stack */
+#define ARCH_LBR_JCC_BIT		16 /* capture conditional branches */
+#define ARCH_LBR_REL_JMP_BIT		17 /* capture relative jumps */
+#define ARCH_LBR_IND_JMP_BIT		18 /* capture indirect jumps */
+#define ARCH_LBR_REL_CALL_BIT		19 /* capture relative calls */
+#define ARCH_LBR_IND_CALL_BIT		20 /* capture indirect calls */
+#define ARCH_LBR_RETURN_BIT		21 /* capture near returns */
+#define ARCH_LBR_OTHER_BRANCH_BIT	22 /* capture other branches */
+
+#define ARCH_LBR_KERNEL			(1ULL << ARCH_LBR_KERNEL_BIT)
+#define ARCH_LBR_USER			(1ULL << ARCH_LBR_USER_BIT)
+#define ARCH_LBR_CALL_STACK		(1ULL << ARCH_LBR_CALL_STACK_BIT)
+#define ARCH_LBR_JCC			(1ULL << ARCH_LBR_JCC_BIT)
+#define ARCH_LBR_REL_JMP		(1ULL << ARCH_LBR_REL_JMP_BIT)
+#define ARCH_LBR_IND_JMP		(1ULL << ARCH_LBR_IND_JMP_BIT)
+#define ARCH_LBR_REL_CALL		(1ULL << ARCH_LBR_REL_CALL_BIT)
+#define ARCH_LBR_IND_CALL		(1ULL << ARCH_LBR_IND_CALL_BIT)
+#define ARCH_LBR_RETURN			(1ULL << ARCH_LBR_RETURN_BIT)
+#define ARCH_LBR_OTHER_BRANCH		(1ULL << ARCH_LBR_OTHER_BRANCH_BIT)
+
+#define ARCH_LBR_ANY			 \
+	(ARCH_LBR_JCC			|\
+	 ARCH_LBR_REL_JMP		|\
+	 ARCH_LBR_IND_JMP		|\
+	 ARCH_LBR_REL_CALL		|\
+	 ARCH_LBR_IND_CALL		|\
+	 ARCH_LBR_RETURN		|\
+	 ARCH_LBR_OTHER_BRANCH)
+
+#define ARCH_LBR_CTL_MASK			0x7f000e
+
 static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
 
 /*
@@ -820,6 +858,11 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 	reg = &event->hw.branch_reg;
 	reg->idx = EXTRA_REG_LBR;
 
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
+		reg->config = mask;
+		return 0;
+	}
+
 	/*
 	 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
 	 * in suppress mode. So LBR_SELECT should be set to
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index cc8117764c08..ba89e563b2aa 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -245,7 +245,10 @@ struct cpu_hw_events {
 	int				lbr_pebs_users;
 	struct perf_branch_stack	lbr_stack;
 	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];
-	struct er_account		*lbr_sel;
+	union {
+		struct er_account		*lbr_sel;
+		struct er_account		*lbr_ctl;
+	};
 	u64				br_sel;
 	void				*last_task_ctx;
 	int				last_log_id;
@@ -688,8 +691,14 @@ struct x86_pmu {
 	 */
 	unsigned int	lbr_tos, lbr_from, lbr_to,
 			lbr_nr;			   /* LBR base regs and size */
-	u64		lbr_sel_mask;		   /* LBR_SELECT valid bits */
-	const int	*lbr_sel_map;		   /* lbr_select mappings */
+	union {
+		u64	lbr_sel_mask;		   /* LBR_SELECT valid bits */
+		u64	lbr_ctl_mask;		   /* LBR_CTL valid bits */
+	};
+	union {
+		const int	*lbr_sel_map;	   /* lbr_select mappings */
+		int		*lbr_ctl_map;	   /* LBR_CTL mappings */
+	};
 	bool		lbr_double_abort;	   /* duplicated lbr aborts */
 	bool		lbr_pt_coexist;		   /* (LBR|BTS) may coexist with PT */
 

From 5624986dc61b81a77fb6136bc232593483d1c254 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:16 -0700
Subject: [PATCH 296/502] perf/x86/intel/lbr: Unify the stored format of LBR
 information

Current LBR information in the structure x86_perf_task_context is stored
in a different format from the PEBS LBR record and Architecture LBR,
which prevents the sharing of the common codes.

Use the format of the PEBS LBR record as a unified format. Use a generic
name lbr_entry to replace pebs_lbr_entry.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-11-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/ds.c        |  6 +++---
 arch/x86/events/intel/lbr.c       | 20 ++++++++++----------
 arch/x86/events/perf_event.h      |  6 ++----
 arch/x86/include/asm/perf_event.h |  6 +-----
 4 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index dc43cc124e09..86848c57b55e 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -954,7 +954,7 @@ static void adaptive_pebs_record_size_update(void)
 	if (pebs_data_cfg & PEBS_DATACFG_XMMS)
 		sz += sizeof(struct pebs_xmm);
 	if (pebs_data_cfg & PEBS_DATACFG_LBRS)
-		sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry);
+		sz += x86_pmu.lbr_nr * sizeof(struct lbr_entry);
 
 	cpuc->pebs_record_size = sz;
 }
@@ -1595,10 +1595,10 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event,
 	}
 
 	if (format_size & PEBS_DATACFG_LBRS) {
-		struct pebs_lbr *lbr = next_record;
+		struct lbr_entry *lbr = next_record;
 		int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT)
 					& 0xff) + 1;
-		next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry);
+		next_record = next_record + num_lbr * sizeof(struct lbr_entry);
 
 		if (has_branch_stack(event)) {
 			intel_pmu_store_pebs_lbrs(lbr);
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 77425624752c..b8baaf15c5f4 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -372,11 +372,11 @@ void intel_pmu_lbr_restore(void *ctx)
 	mask = x86_pmu.lbr_nr - 1;
 	for (i = 0; i < task_ctx->valid_lbrs; i++) {
 		lbr_idx = (tos - i) & mask;
-		wrlbr_from(lbr_idx, task_ctx->lbr_from[i]);
-		wrlbr_to  (lbr_idx, task_ctx->lbr_to[i]);
+		wrlbr_from(lbr_idx, task_ctx->lbr[i].from);
+		wrlbr_to(lbr_idx, task_ctx->lbr[i].to);
 
 		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info);
 	}
 
 	for (; i < x86_pmu.lbr_nr; i++) {
@@ -440,10 +440,10 @@ void intel_pmu_lbr_save(void *ctx)
 		from = rdlbr_from(lbr_idx);
 		if (!from)
 			break;
-		task_ctx->lbr_from[i] = from;
-		task_ctx->lbr_to[i]   = rdlbr_to(lbr_idx);
+		task_ctx->lbr[i].from = from;
+		task_ctx->lbr[i].to = rdlbr_to(lbr_idx);
 		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
+			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info);
 	}
 	task_ctx->valid_lbrs = i;
 	task_ctx->tos = tos;
@@ -1179,7 +1179,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 	}
 }
 
-void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr)
+void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	int i;
@@ -1193,11 +1193,11 @@ void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr)
 		cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos();
 
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
-		u64 info = lbr->lbr[i].info;
+		u64 info = lbr[i].info;
 		struct perf_branch_entry *e = &cpuc->lbr_entries[i];
 
-		e->from		= lbr->lbr[i].from;
-		e->to		= lbr->lbr[i].to;
+		e->from		= lbr[i].from;
+		e->to		= lbr[i].to;
 		e->mispred	= !!(info & LBR_INFO_MISPRED);
 		e->predicted	= !(info & LBR_INFO_MISPRED);
 		e->in_tx	= !!(info & LBR_INFO_IN_TX);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index ba89e563b2aa..aaa426d3d66e 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -765,13 +765,11 @@ struct x86_perf_task_context_opt {
 };
 
 struct x86_perf_task_context {
-	u64 lbr_from[MAX_LBR_ENTRIES];
-	u64 lbr_to[MAX_LBR_ENTRIES];
-	u64 lbr_info[MAX_LBR_ENTRIES];
 	u64 lbr_sel;
 	int tos;
 	int valid_lbrs;
 	struct x86_perf_task_context_opt opt;
+	struct lbr_entry lbr[MAX_LBR_ENTRIES];
 };
 
 #define x86_add_quirk(func_)						\
@@ -1092,7 +1090,7 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in);
 
 void intel_pmu_auto_reload_read(struct perf_event *event);
 
-void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr);
+void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr);
 
 void intel_ds_init(void);
 
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 9ffce7d31c4c..2e29558c9c6b 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -282,14 +282,10 @@ struct pebs_xmm {
 	u64 xmm[16*2];	/* two entries for each register */
 };
 
-struct pebs_lbr_entry {
+struct lbr_entry {
 	u64 from, to, info;
 };
 
-struct pebs_lbr {
-	struct pebs_lbr_entry lbr[0]; /* Variable length */
-};
-
 /*
  * IBS cpuid feature detection
  */

From 020d91e5f32da4f4b929b3a6e680135fd526107c Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:17 -0700
Subject: [PATCH 297/502] perf/x86/intel/lbr: Mark the {rd,wr}lbr_{to,from}
 wrappers __always_inline

The {rd,wr}lbr_{to,from} wrappers are invoked in hot paths, e.g. context
switch and NMI handler. They should be always inline to achieve better
performance. However, the CONFIG_OPTIMIZE_INLINING allows the compiler
to uninline functions marked 'inline'.

Mark the {rd,wr}lbr_{to,from} wrappers as __always_inline to force
inline the wrappers.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-12-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/lbr.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index b8baaf15c5f4..21f4f071f2c0 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -332,18 +332,18 @@ static u64 lbr_from_signext_quirk_rd(u64 val)
 	return val;
 }
 
-static inline void wrlbr_from(unsigned int idx, u64 val)
+static __always_inline void wrlbr_from(unsigned int idx, u64 val)
 {
 	val = lbr_from_signext_quirk_wr(val);
 	wrmsrl(x86_pmu.lbr_from + idx, val);
 }
 
-static inline void wrlbr_to(unsigned int idx, u64 val)
+static __always_inline void wrlbr_to(unsigned int idx, u64 val)
 {
 	wrmsrl(x86_pmu.lbr_to + idx, val);
 }
 
-static inline u64 rdlbr_from(unsigned int idx)
+static __always_inline u64 rdlbr_from(unsigned int idx)
 {
 	u64 val;
 
@@ -352,7 +352,7 @@ static inline u64 rdlbr_from(unsigned int idx)
 	return lbr_from_signext_quirk_rd(val);
 }
 
-static inline u64 rdlbr_to(unsigned int idx)
+static __always_inline u64 rdlbr_to(unsigned int idx)
 {
 	u64 val;
 

From fda1f99f34a8f0975086bcfef34da865009995c1 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:18 -0700
Subject: [PATCH 298/502] perf/x86/intel/lbr: Factor out rdlbr_all() and
 wrlbr_all()

The previous model-specific LBR and Architecture LBR (legacy way) use a
similar method to save/restore the LBR information, which directly
accesses the LBR registers. The codes which read/write a set of LBR
registers can be shared between them.

Factor out two functions which are used to read/write a set of LBR
registers.

Add lbr_info into structure x86_pmu, and use it to replace the hardcoded
LBR INFO MSR, because the LBR INFO MSR address of the previous
model-specific LBR is different from Architecture LBR. The MSR address
should be assigned at boot time. For now, only Sky Lake and later
platforms have the LBR INFO MSR.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-13-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/lbr.c  | 66 +++++++++++++++++++++++++++---------
 arch/x86/events/perf_event.h |  2 +-
 2 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 21f4f071f2c0..d3d129c7d7ef 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -237,7 +237,7 @@ void intel_pmu_lbr_reset_64(void)
 		wrmsrl(x86_pmu.lbr_from + i, 0);
 		wrmsrl(x86_pmu.lbr_to   + i, 0);
 		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-			wrmsrl(MSR_LBR_INFO_0 + i, 0);
+			wrmsrl(x86_pmu.lbr_info + i, 0);
 	}
 }
 
@@ -343,6 +343,11 @@ static __always_inline void wrlbr_to(unsigned int idx, u64 val)
 	wrmsrl(x86_pmu.lbr_to + idx, val);
 }
 
+static __always_inline void wrlbr_info(unsigned int idx, u64 val)
+{
+	wrmsrl(x86_pmu.lbr_info + idx, val);
+}
+
 static __always_inline u64 rdlbr_from(unsigned int idx)
 {
 	u64 val;
@@ -361,8 +366,44 @@ static __always_inline u64 rdlbr_to(unsigned int idx)
 	return val;
 }
 
+static __always_inline u64 rdlbr_info(unsigned int idx)
+{
+	u64 val;
+
+	rdmsrl(x86_pmu.lbr_info + idx, val);
+
+	return val;
+}
+
+static inline void
+wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
+{
+	wrlbr_from(idx, lbr->from);
+	wrlbr_to(idx, lbr->to);
+	if (need_info)
+		wrlbr_info(idx, lbr->info);
+}
+
+static inline bool
+rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
+{
+	u64 from = rdlbr_from(idx);
+
+	/* Don't read invalid entry */
+	if (!from)
+		return false;
+
+	lbr->from = from;
+	lbr->to = rdlbr_to(idx);
+	if (need_info)
+		lbr->info = rdlbr_info(idx);
+
+	return true;
+}
+
 void intel_pmu_lbr_restore(void *ctx)
 {
+	bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct x86_perf_task_context *task_ctx = ctx;
 	int i;
@@ -372,11 +413,7 @@ void intel_pmu_lbr_restore(void *ctx)
 	mask = x86_pmu.lbr_nr - 1;
 	for (i = 0; i < task_ctx->valid_lbrs; i++) {
 		lbr_idx = (tos - i) & mask;
-		wrlbr_from(lbr_idx, task_ctx->lbr[i].from);
-		wrlbr_to(lbr_idx, task_ctx->lbr[i].to);
-
-		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info);
+		wrlbr_all(&task_ctx->lbr[i], lbr_idx, need_info);
 	}
 
 	for (; i < x86_pmu.lbr_nr; i++) {
@@ -384,7 +421,7 @@ void intel_pmu_lbr_restore(void *ctx)
 		wrlbr_from(lbr_idx, 0);
 		wrlbr_to(lbr_idx, 0);
 		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, 0);
+			wrlbr_info(lbr_idx, 0);
 	}
 
 	wrmsrl(x86_pmu.lbr_tos, tos);
@@ -427,23 +464,19 @@ static void __intel_pmu_lbr_restore(void *ctx)
 
 void intel_pmu_lbr_save(void *ctx)
 {
+	bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct x86_perf_task_context *task_ctx = ctx;
 	unsigned lbr_idx, mask;
-	u64 tos, from;
+	u64 tos;
 	int i;
 
 	mask = x86_pmu.lbr_nr - 1;
 	tos = intel_pmu_lbr_tos();
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		lbr_idx = (tos - i) & mask;
-		from = rdlbr_from(lbr_idx);
-		if (!from)
+		if (!rdlbr_all(&task_ctx->lbr[i], lbr_idx, need_info))
 			break;
-		task_ctx->lbr[i].from = from;
-		task_ctx->lbr[i].to = rdlbr_to(lbr_idx);
-		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
-			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr[i].info);
 	}
 	task_ctx->valid_lbrs = i;
 	task_ctx->tos = tos;
@@ -689,7 +722,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		if (lbr_format == LBR_FORMAT_INFO && need_info) {
 			u64 info;
 
-			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+			info = rdlbr_info(lbr_idx);
 			mis = !!(info & LBR_INFO_MISPRED);
 			pred = !mis;
 			in_tx = !!(info & LBR_INFO_IN_TX);
@@ -1336,6 +1369,7 @@ __init void intel_pmu_lbr_init_skl(void)
 	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
 	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
 	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+	x86_pmu.lbr_info = MSR_LBR_INFO_0;
 
 	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
 	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
@@ -1421,7 +1455,7 @@ int x86_perf_get_lbr(struct x86_pmu_lbr *lbr)
 	lbr->nr = x86_pmu.lbr_nr;
 	lbr->from = x86_pmu.lbr_from;
 	lbr->to = x86_pmu.lbr_to;
-	lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? MSR_LBR_INFO_0 : 0;
+	lbr->info = (lbr_fmt == LBR_FORMAT_INFO) ? x86_pmu.lbr_info : 0;
 
 	return 0;
 }
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index aaa426d3d66e..20e35cb1705d 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -690,7 +690,7 @@ struct x86_pmu {
 	 * Intel LBR
 	 */
 	unsigned int	lbr_tos, lbr_from, lbr_to,
-			lbr_nr;			   /* LBR base regs and size */
+			lbr_info, lbr_nr;	   /* LBR base regs and size */
 	union {
 		u64	lbr_sel_mask;		   /* LBR_SELECT valid bits */
 		u64	lbr_ctl_mask;		   /* LBR_CTL valid bits */

From 631618a0dca31dc23dcce38cf345c6139bd8a1e9 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:19 -0700
Subject: [PATCH 299/502] perf/x86/intel/lbr: Factor out intel_pmu_store_lbr

The way to store the LBR information from a PEBS LBR record can be
reused in Architecture LBR, because
- The LBR information is stored like a stack. Entry 0 is always the
  youngest branch.
- The layout of the LBR INFO MSR is similar.

The LBR information may be retrieved from either the LBR registers
(non-PEBS event) or a buffer (PEBS event). Extend rdlbr_*() to support
both methods.

Explicitly check the invalid entry (0s), which can avoid unnecessary MSR
access if using a non-PEBS event. For a PEBS event, the check should
slightly improve the performance as well. The invalid entries are cut.
The intel_pmu_lbr_filter() doesn't need to check and filter them out.

Cannot share the function with current model-specific LBR read, because
the direction of the LBR growth is opposite.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-14-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/lbr.c | 82 +++++++++++++++++++++++++------------
 1 file changed, 56 insertions(+), 26 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index d3d129c7d7ef..0d7a85903964 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -348,28 +348,37 @@ static __always_inline void wrlbr_info(unsigned int idx, u64 val)
 	wrmsrl(x86_pmu.lbr_info + idx, val);
 }
 
-static __always_inline u64 rdlbr_from(unsigned int idx)
+static __always_inline u64 rdlbr_from(unsigned int idx, struct lbr_entry *lbr)
 {
 	u64 val;
 
+	if (lbr)
+		return lbr->from;
+
 	rdmsrl(x86_pmu.lbr_from + idx, val);
 
 	return lbr_from_signext_quirk_rd(val);
 }
 
-static __always_inline u64 rdlbr_to(unsigned int idx)
+static __always_inline u64 rdlbr_to(unsigned int idx, struct lbr_entry *lbr)
 {
 	u64 val;
 
+	if (lbr)
+		return lbr->to;
+
 	rdmsrl(x86_pmu.lbr_to + idx, val);
 
 	return val;
 }
 
-static __always_inline u64 rdlbr_info(unsigned int idx)
+static __always_inline u64 rdlbr_info(unsigned int idx, struct lbr_entry *lbr)
 {
 	u64 val;
 
+	if (lbr)
+		return lbr->info;
+
 	rdmsrl(x86_pmu.lbr_info + idx, val);
 
 	return val;
@@ -387,16 +396,16 @@ wrlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
 static inline bool
 rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
 {
-	u64 from = rdlbr_from(idx);
+	u64 from = rdlbr_from(idx, NULL);
 
 	/* Don't read invalid entry */
 	if (!from)
 		return false;
 
 	lbr->from = from;
-	lbr->to = rdlbr_to(idx);
+	lbr->to = rdlbr_to(idx, NULL);
 	if (need_info)
-		lbr->info = rdlbr_info(idx);
+		lbr->info = rdlbr_info(idx, NULL);
 
 	return true;
 }
@@ -432,7 +441,7 @@ void intel_pmu_lbr_restore(void *ctx)
 
 static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
 {
-	return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos);
+	return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL);
 }
 
 static void __intel_pmu_lbr_restore(void *ctx)
@@ -709,8 +718,8 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		u16 cycles = 0;
 		int lbr_flags = lbr_desc[lbr_format];
 
-		from = rdlbr_from(lbr_idx);
-		to   = rdlbr_to(lbr_idx);
+		from = rdlbr_from(lbr_idx, NULL);
+		to   = rdlbr_to(lbr_idx, NULL);
 
 		/*
 		 * Read LBR call stack entries
@@ -722,7 +731,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		if (lbr_format == LBR_FORMAT_INFO && need_info) {
 			u64 info;
 
-			info = rdlbr_info(lbr_idx);
+			info = rdlbr_info(lbr_idx, NULL);
 			mis = !!(info & LBR_INFO_MISPRED);
 			pred = !mis;
 			in_tx = !!(info & LBR_INFO_IN_TX);
@@ -777,6 +786,42 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	cpuc->lbr_stack.hw_idx = tos;
 }
 
+static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
+				struct lbr_entry *entries)
+{
+	struct perf_branch_entry *e;
+	struct lbr_entry *lbr;
+	u64 from, to, info;
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		lbr = entries ? &entries[i] : NULL;
+		e = &cpuc->lbr_entries[i];
+
+		from = rdlbr_from(i, lbr);
+		/*
+		 * Read LBR entries until invalid entry (0s) is detected.
+		 */
+		if (!from)
+			break;
+
+		to = rdlbr_to(i, lbr);
+		info = rdlbr_info(i, lbr);
+
+		e->from		= from;
+		e->to		= to;
+		e->mispred	= !!(info & LBR_INFO_MISPRED);
+		e->predicted	= !(info & LBR_INFO_MISPRED);
+		e->in_tx	= !!(info & LBR_INFO_IN_TX);
+		e->abort	= !!(info & LBR_INFO_ABORT);
+		e->cycles	= info & LBR_INFO_CYCLES;
+		e->type		= 0;
+		e->reserved	= 0;
+	}
+
+	cpuc->lbr_stack.nr = i;
+}
+
 void intel_pmu_lbr_read(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1215,9 +1260,6 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	int i;
-
-	cpuc->lbr_stack.nr = x86_pmu.lbr_nr;
 
 	/* Cannot get TOS for large PEBS */
 	if (cpuc->n_pebs == cpuc->n_large_pebs)
@@ -1225,19 +1267,7 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
 	else
 		cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos();
 
-	for (i = 0; i < x86_pmu.lbr_nr; i++) {
-		u64 info = lbr[i].info;
-		struct perf_branch_entry *e = &cpuc->lbr_entries[i];
-
-		e->from		= lbr[i].from;
-		e->to		= lbr[i].to;
-		e->mispred	= !!(info & LBR_INFO_MISPRED);
-		e->predicted	= !(info & LBR_INFO_MISPRED);
-		e->in_tx	= !!(info & LBR_INFO_IN_TX);
-		e->abort	= !!(info & LBR_INFO_ABORT);
-		e->cycles	= info & LBR_INFO_CYCLES;
-		e->reserved	= 0;
-	}
+	intel_pmu_store_lbr(cpuc, lbr);
 	intel_pmu_lbr_filter(cpuc);
 }
 

From 47125db27e47e9d44c878bf8925aa057824bb0d5 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:20 -0700
Subject: [PATCH 300/502] perf/x86/intel/lbr: Support Architectural LBR

Last Branch Records (LBR) enables recording of software path history by
logging taken branches and other control flows within architectural
registers now. Intel CPUs have had model-specific LBR for quite some
time, but this evolves them into an architectural feature now.

The main improvements implemented by Architectural LBR include:
- Linux kernel can support the LBR features without knowing the model
  number of the current CPU.
- Architectural LBR capabilities can be enumerated by CPUID. The
  lbr_ctl_map is based on the CPUID Enumeration.
- The possible LBR depth can be retrieved from CPUID enumeration. The
  max value is written to the new MSR_ARCH_LBR_DEPTH as the number of
  LBR entries.
- A new IA32_LBR_CTL MSR is introduced to enable and configure LBRs,
  which replaces the IA32_DEBUGCTL[bit 0] and the LBR_SELECT MSR.
- Each LBR record or entry is still comprised of three MSRs,
  IA32_LBR_x_FROM_IP, IA32_LBR_x_TO_IP and IA32_LBR_x_INFO.
  But they become the architectural MSRs.
- Architectural LBR is stack-like now. Entry 0 is always the youngest
  branch, entry 1 the next youngest... The TOS MSR has been removed.

The way to enable/disable Architectural LBR is similar to the previous
model-specific LBR. __intel_pmu_lbr_enable/disable() can be reused, but
some modifications are required, which include:
- MSR_ARCH_LBR_CTL is used to enable and configure the Architectural
  LBR.
- When checking the value of the IA32_DEBUGCTL MSR, DEBUGCTLMSR_LBR
  (bit 0) is ignored for Architectural LBR, because it has no meaning
  there and always returns 0.
- The FREEZE_LBRS_ON_PMI has to be explicitly set/clear, because
  MSR_IA32_DEBUGCTLMSR is not touched in __intel_pmu_lbr_disable() for
  Architectural LBR.
- Only MSR_ARCH_LBR_CTL is cleared in __intel_pmu_lbr_disable() for
  Architectural LBR.

Some Architectural LBR dedicated functions are implemented to
reset/read/save/restore LBR.
- For reset, writing to the ARCH_LBR_DEPTH MSR clears all Arch LBR
  entries, which is a lot faster and can improve the context switch
  latency.
- For read, the branch type information can be retrieved from
  the MSR_ARCH_LBR_INFO_*. But it's not fully compatible due to
  OTHER_BRANCH type. The software decoding is still required for the
  OTHER_BRANCH case.
  LBR records are stored in the age order as well. Reuse
  intel_pmu_store_lbr(). Check the CPUID enumeration before accessing
  the corresponding bits in LBR_INFO.
- For save/restore, the fast reset (writing ARCH_LBR_DEPTH) is applied.
  'lbr_from' of entry 0 is read instead of the TOS MSR to check if the
  LBR registers are reset in the deep C-state. If 'the deep C-state
  reset' bit is not set in CPUID enumeration, the check is skipped.
  XSAVE support for Architectural LBR will be implemented later.

The number of LBR entries cannot be hardcoded anymore; it has to be
retrieved from the CPUID enumeration. A new structure
x86_perf_task_context_arch_lbr is introduced for Architectural LBR.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-15-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/core.c |   3 +
 arch/x86/events/intel/lbr.c  | 251 +++++++++++++++++++++++++++++++++--
 arch/x86/events/perf_event.h |  10 ++
 3 files changed, 253 insertions(+), 11 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 50cb3c69d6a4..50963472ee85 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4664,6 +4664,9 @@ __init int intel_pmu_init(void)
 		x86_pmu.lbr_read = intel_pmu_lbr_read_32;
 	}
 
+	if (boot_cpu_has(X86_FEATURE_ARCH_LBR))
+		intel_pmu_arch_lbr_init();
+
 	intel_ds_init();
 
 	x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 0d7a85903964..e4e249a78451 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -172,6 +172,14 @@ enum {
 
 static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
 
+static __always_inline bool is_lbr_call_stack_bit_set(u64 config)
+{
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+		return !!(config & ARCH_LBR_CALL_STACK);
+
+	return !!(config & LBR_CALL_STACK);
+}
+
 /*
  * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
  * otherwise it becomes near impossible to get a reliable stack.
@@ -195,27 +203,40 @@ static void __intel_pmu_lbr_enable(bool pmi)
 	 */
 	if (cpuc->lbr_sel)
 		lbr_select = cpuc->lbr_sel->config & x86_pmu.lbr_sel_mask;
-	if (!pmi && cpuc->lbr_sel)
+	if (!static_cpu_has(X86_FEATURE_ARCH_LBR) && !pmi && cpuc->lbr_sel)
 		wrmsrl(MSR_LBR_SELECT, lbr_select);
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 	orig_debugctl = debugctl;
-	debugctl |= DEBUGCTLMSR_LBR;
+
+	if (!static_cpu_has(X86_FEATURE_ARCH_LBR))
+		debugctl |= DEBUGCTLMSR_LBR;
 	/*
 	 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
 	 * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
 	 * may cause superfluous increase/decrease of LBR_TOS.
 	 */
-	if (!(lbr_select & LBR_CALL_STACK))
+	if (is_lbr_call_stack_bit_set(lbr_select))
+		debugctl &= ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+	else
 		debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
+
 	if (orig_debugctl != debugctl)
 		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+		wrmsrl(MSR_ARCH_LBR_CTL, lbr_select | ARCH_LBR_CTL_LBREN);
 }
 
 static void __intel_pmu_lbr_disable(void)
 {
 	u64 debugctl;
 
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR)) {
+		wrmsrl(MSR_ARCH_LBR_CTL, 0);
+		return;
+	}
+
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 	debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
@@ -241,6 +262,12 @@ void intel_pmu_lbr_reset_64(void)
 	}
 }
 
+static void intel_pmu_arch_lbr_reset(void)
+{
+	/* Write to ARCH_LBR_DEPTH MSR, all LBR entries are reset to 0 */
+	wrmsrl(MSR_ARCH_LBR_DEPTH, x86_pmu.lbr_nr);
+}
+
 void intel_pmu_lbr_reset(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -439,8 +466,28 @@ void intel_pmu_lbr_restore(void *ctx)
 		wrmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
+static void intel_pmu_arch_lbr_restore(void *ctx)
+{
+	struct x86_perf_task_context_arch_lbr *task_ctx = ctx;
+	struct lbr_entry *entries = task_ctx->entries;
+	int i;
+
+	/* Fast reset the LBRs before restore if the call stack is not full. */
+	if (!entries[x86_pmu.lbr_nr - 1].from)
+		intel_pmu_arch_lbr_reset();
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		if (!entries[i].from)
+			break;
+		wrlbr_all(&entries[i], i, true);
+	}
+}
+
 static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
 {
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+		return x86_pmu.lbr_deep_c_reset && !rdlbr_from(0, NULL);
+
 	return !rdlbr_from(((struct x86_perf_task_context *)ctx)->tos, NULL);
 }
 
@@ -494,6 +541,22 @@ void intel_pmu_lbr_save(void *ctx)
 		rdmsrl(MSR_LBR_SELECT, task_ctx->lbr_sel);
 }
 
+static void intel_pmu_arch_lbr_save(void *ctx)
+{
+	struct x86_perf_task_context_arch_lbr *task_ctx = ctx;
+	struct lbr_entry *entries = task_ctx->entries;
+	int i;
+
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		if (!rdlbr_all(&entries[i], i, true))
+			break;
+	}
+
+	/* LBR call stack is not full. Reset is required in restore. */
+	if (i < x86_pmu.lbr_nr)
+		entries[x86_pmu.lbr_nr - 1].from = 0;
+}
+
 static void __intel_pmu_lbr_save(void *ctx)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -786,6 +849,39 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	cpuc->lbr_stack.hw_idx = tos;
 }
 
+static __always_inline int get_lbr_br_type(u64 info)
+{
+	if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type)
+		return 0;
+
+	return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET;
+}
+
+static __always_inline bool get_lbr_mispred(u64 info)
+{
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred)
+		return 0;
+
+	return !!(info & LBR_INFO_MISPRED);
+}
+
+static __always_inline bool get_lbr_predicted(u64 info)
+{
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred)
+		return 0;
+
+	return !(info & LBR_INFO_MISPRED);
+}
+
+static __always_inline bool get_lbr_cycles(u64 info)
+{
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
+	    !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID))
+		return 0;
+
+	return info & LBR_INFO_CYCLES;
+}
+
 static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
 				struct lbr_entry *entries)
 {
@@ -810,18 +906,23 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
 
 		e->from		= from;
 		e->to		= to;
-		e->mispred	= !!(info & LBR_INFO_MISPRED);
-		e->predicted	= !(info & LBR_INFO_MISPRED);
+		e->mispred	= get_lbr_mispred(info);
+		e->predicted	= get_lbr_predicted(info);
 		e->in_tx	= !!(info & LBR_INFO_IN_TX);
 		e->abort	= !!(info & LBR_INFO_ABORT);
-		e->cycles	= info & LBR_INFO_CYCLES;
-		e->type		= 0;
+		e->cycles	= get_lbr_cycles(info);
+		e->type		= get_lbr_br_type(info);
 		e->reserved	= 0;
 	}
 
 	cpuc->lbr_stack.nr = i;
 }
 
+static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
+{
+	intel_pmu_store_lbr(cpuc, NULL);
+}
+
 void intel_pmu_lbr_read(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1197,6 +1298,27 @@ common_branch_type(int type)
 	return PERF_BR_UNKNOWN;
 }
 
+enum {
+	ARCH_LBR_BR_TYPE_JCC			= 0,
+	ARCH_LBR_BR_TYPE_NEAR_IND_JMP		= 1,
+	ARCH_LBR_BR_TYPE_NEAR_REL_JMP		= 2,
+	ARCH_LBR_BR_TYPE_NEAR_IND_CALL		= 3,
+	ARCH_LBR_BR_TYPE_NEAR_REL_CALL		= 4,
+	ARCH_LBR_BR_TYPE_NEAR_RET		= 5,
+	ARCH_LBR_BR_TYPE_KNOWN_MAX		= ARCH_LBR_BR_TYPE_NEAR_RET,
+
+	ARCH_LBR_BR_TYPE_MAP_MAX		= 16,
+};
+
+static const int arch_lbr_br_type_map[ARCH_LBR_BR_TYPE_MAP_MAX] = {
+	[ARCH_LBR_BR_TYPE_JCC]			= X86_BR_JCC,
+	[ARCH_LBR_BR_TYPE_NEAR_IND_JMP]		= X86_BR_IND_JMP,
+	[ARCH_LBR_BR_TYPE_NEAR_REL_JMP]		= X86_BR_JMP,
+	[ARCH_LBR_BR_TYPE_NEAR_IND_CALL]	= X86_BR_IND_CALL,
+	[ARCH_LBR_BR_TYPE_NEAR_REL_CALL]	= X86_BR_CALL,
+	[ARCH_LBR_BR_TYPE_NEAR_RET]		= X86_BR_RET,
+};
+
 /*
  * implement actual branch filter based on user demand.
  * Hardware may not exactly satisfy that request, thus
@@ -1209,7 +1331,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 {
 	u64 from, to;
 	int br_sel = cpuc->br_sel;
-	int i, j, type;
+	int i, j, type, to_plm;
 	bool compress = false;
 
 	/* if sampling all branches, then nothing to filter */
@@ -1221,8 +1343,19 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 
 		from = cpuc->lbr_entries[i].from;
 		to = cpuc->lbr_entries[i].to;
+		type = cpuc->lbr_entries[i].type;
 
-		type = branch_type(from, to, cpuc->lbr_entries[i].abort);
+		/*
+		 * Parse the branch type recorded in LBR_x_INFO MSR.
+		 * Doesn't support OTHER_BRANCH decoding for now.
+		 * OTHER_BRANCH branch type still rely on software decoding.
+		 */
+		if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
+		    type <= ARCH_LBR_BR_TYPE_KNOWN_MAX) {
+			to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER;
+			type = arch_lbr_br_type_map[type] | to_plm;
+		} else
+			type = branch_type(from, to, cpuc->lbr_entries[i].abort);
 		if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) {
 			if (cpuc->lbr_entries[i].in_tx)
 				type |= X86_BR_IN_TX;
@@ -1261,8 +1394,9 @@ void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
-	/* Cannot get TOS for large PEBS */
-	if (cpuc->n_pebs == cpuc->n_large_pebs)
+	/* Cannot get TOS for large PEBS and Arch LBR */
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR) ||
+	    (cpuc->n_pebs == cpuc->n_large_pebs))
 		cpuc->lbr_stack.hw_idx = -1ULL;
 	else
 		cpuc->lbr_stack.hw_idx = intel_pmu_lbr_tos();
@@ -1324,6 +1458,26 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
 	[PERF_SAMPLE_BRANCH_CALL_SHIFT]		= LBR_REL_CALL,
 };
 
+static int arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
+	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= ARCH_LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= ARCH_LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= ARCH_LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= ARCH_LBR_RETURN |
+						  ARCH_LBR_OTHER_BRANCH,
+	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]     = ARCH_LBR_REL_CALL |
+						  ARCH_LBR_IND_CALL |
+						  ARCH_LBR_OTHER_BRANCH,
+	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]     = ARCH_LBR_IND_CALL,
+	[PERF_SAMPLE_BRANCH_COND_SHIFT]         = ARCH_LBR_JCC,
+	[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]   = ARCH_LBR_REL_CALL |
+						  ARCH_LBR_IND_CALL |
+						  ARCH_LBR_RETURN |
+						  ARCH_LBR_CALL_STACK,
+	[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT]	= ARCH_LBR_IND_JMP,
+	[PERF_SAMPLE_BRANCH_CALL_SHIFT]		= ARCH_LBR_REL_CALL,
+};
+
 /* core */
 void __init intel_pmu_lbr_init_core(void)
 {
@@ -1471,6 +1625,81 @@ void intel_pmu_lbr_init_knl(void)
 		x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
 }
 
+void __init intel_pmu_arch_lbr_init(void)
+{
+	union cpuid28_eax eax;
+	union cpuid28_ebx ebx;
+	union cpuid28_ecx ecx;
+	unsigned int unused_edx;
+	u64 lbr_nr;
+
+	/* Arch LBR Capabilities */
+	cpuid(28, &eax.full, &ebx.full, &ecx.full, &unused_edx);
+
+	lbr_nr = fls(eax.split.lbr_depth_mask) * 8;
+	if (!lbr_nr)
+		goto clear_arch_lbr;
+
+	/* Apply the max depth of Arch LBR */
+	if (wrmsrl_safe(MSR_ARCH_LBR_DEPTH, lbr_nr))
+		goto clear_arch_lbr;
+
+	x86_pmu.lbr_depth_mask = eax.split.lbr_depth_mask;
+	x86_pmu.lbr_deep_c_reset = eax.split.lbr_deep_c_reset;
+	x86_pmu.lbr_lip = eax.split.lbr_lip;
+	x86_pmu.lbr_cpl = ebx.split.lbr_cpl;
+	x86_pmu.lbr_filter = ebx.split.lbr_filter;
+	x86_pmu.lbr_call_stack = ebx.split.lbr_call_stack;
+	x86_pmu.lbr_mispred = ecx.split.lbr_mispred;
+	x86_pmu.lbr_timed_lbr = ecx.split.lbr_timed_lbr;
+	x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
+	x86_pmu.lbr_nr = lbr_nr;
+
+	x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) +
+				       lbr_nr * sizeof(struct lbr_entry);
+
+	x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0;
+	x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0;
+	x86_pmu.lbr_info = MSR_ARCH_LBR_INFO_0;
+
+	/* LBR callstack requires both CPL and Branch Filtering support */
+	if (!x86_pmu.lbr_cpl ||
+	    !x86_pmu.lbr_filter ||
+	    !x86_pmu.lbr_call_stack)
+		arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_NOT_SUPP;
+
+	if (!x86_pmu.lbr_cpl) {
+		arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_USER_SHIFT] = LBR_NOT_SUPP;
+		arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_KERNEL_SHIFT] = LBR_NOT_SUPP;
+	} else if (!x86_pmu.lbr_filter) {
+		arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_SHIFT] = LBR_NOT_SUPP;
+		arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT] = LBR_NOT_SUPP;
+		arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] = LBR_NOT_SUPP;
+		arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_NOT_SUPP;
+		arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_NOT_SUPP;
+		arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_NOT_SUPP;
+		arch_lbr_ctl_map[PERF_SAMPLE_BRANCH_CALL_SHIFT] = LBR_NOT_SUPP;
+	}
+
+	x86_pmu.lbr_ctl_mask = ARCH_LBR_CTL_MASK;
+	x86_pmu.lbr_ctl_map  = arch_lbr_ctl_map;
+
+	if (!x86_pmu.lbr_cpl && !x86_pmu.lbr_filter)
+		x86_pmu.lbr_ctl_map = NULL;
+
+	x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset;
+	x86_pmu.lbr_read = intel_pmu_arch_lbr_read;
+	x86_pmu.lbr_save = intel_pmu_arch_lbr_save;
+	x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore;
+
+	pr_cont("Architectural LBR, ");
+
+	return;
+
+clear_arch_lbr:
+	clear_cpu_cap(&boot_cpu_data, X86_FEATURE_ARCH_LBR);
+}
+
 /**
  * x86_perf_get_lbr - get the LBR records information
  *
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 20e35cb1705d..3f7c329374bb 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -772,6 +772,11 @@ struct x86_perf_task_context {
 	struct lbr_entry lbr[MAX_LBR_ENTRIES];
 };
 
+struct x86_perf_task_context_arch_lbr {
+	struct x86_perf_task_context_opt opt;
+	struct lbr_entry entries[];
+};
+
 #define x86_add_quirk(func_)						\
 do {									\
 	static struct x86_pmu_quirk __quirk __initdata = {		\
@@ -822,6 +827,9 @@ extern struct x86_pmu x86_pmu __read_mostly;
 
 static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx)
 {
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR))
+		return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt;
+
 	return &((struct x86_perf_task_context *)ctx)->opt;
 }
 
@@ -1141,6 +1149,8 @@ void intel_pmu_lbr_init_skl(void);
 
 void intel_pmu_lbr_init_knl(void);
 
+void intel_pmu_arch_lbr_init(void);
+
 void intel_pmu_pebs_data_source_nhm(void);
 
 void intel_pmu_pebs_data_source_skl(bool pmem);

From ff9ff926889dd8026b4ba55266a010c27f68604f Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:21 -0700
Subject: [PATCH 301/502] perf/core: Factor out functions to allocate/free the
 task_ctx_data

The method to allocate/free the task_ctx_data is going to be changed in
the following patch. Currently, the task_ctx_data is allocated/freed in
several different places. To avoid repeatedly modifying the same code
in several different places, alloc_task_ctx_data() and
free_task_ctx_data() are factored out to allocate/free the
task_ctx_data. The modification only needs to be applied once.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-16-git-send-email-kan.liang@linux.intel.com
---
 kernel/events/core.c | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9b8f92500833..75090403f942 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1238,12 +1238,22 @@ static void get_ctx(struct perf_event_context *ctx)
 	refcount_inc(&ctx->refcount);
 }
 
+static void *alloc_task_ctx_data(struct pmu *pmu)
+{
+	return kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+}
+
+static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
+{
+	kfree(task_ctx_data);
+}
+
 static void free_ctx(struct rcu_head *head)
 {
 	struct perf_event_context *ctx;
 
 	ctx = container_of(head, struct perf_event_context, rcu_head);
-	kfree(ctx->task_ctx_data);
+	free_task_ctx_data(ctx->pmu, ctx->task_ctx_data);
 	kfree(ctx);
 }
 
@@ -4471,7 +4481,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
 		goto errout;
 
 	if (event->attach_state & PERF_ATTACH_TASK_DATA) {
-		task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+		task_ctx_data = alloc_task_ctx_data(pmu);
 		if (!task_ctx_data) {
 			err = -ENOMEM;
 			goto errout;
@@ -4529,11 +4539,11 @@ retry:
 		}
 	}
 
-	kfree(task_ctx_data);
+	free_task_ctx_data(pmu, task_ctx_data);
 	return ctx;
 
 errout:
-	kfree(task_ctx_data);
+	free_task_ctx_data(pmu, task_ctx_data);
 	return ERR_PTR(err);
 }
 
@@ -12497,8 +12507,7 @@ inherit_event(struct perf_event *parent_event,
 	    !child_ctx->task_ctx_data) {
 		struct pmu *pmu = child_event->pmu;
 
-		child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size,
-						   GFP_KERNEL);
+		child_ctx->task_ctx_data = alloc_task_ctx_data(pmu);
 		if (!child_ctx->task_ctx_data) {
 			free_event(child_event);
 			return ERR_PTR(-ENOMEM);

From 217c2a633ebb36f1cc6d249f4ef2e4a809d46818 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:22 -0700
Subject: [PATCH 302/502] perf/core: Use kmem_cache to allocate the PMU
 specific data

Currently, the PMU specific data task_ctx_data is allocated by the
function kzalloc() in the perf generic code. When there is no specific
alignment requirement for the task_ctx_data, the method works well for
now. However, there will be a problem once a specific alignment
requirement is introduced in future features, e.g., the Architecture LBR
XSAVE feature requires 64-byte alignment. If the specific alignment
requirement is not fulfilled, the XSAVE family of instructions will fail
to save/restore the xstate to/from the task_ctx_data.

The function kzalloc() itself only guarantees a natural alignment. A
new method to allocate the task_ctx_data has to be introduced, which
has to meet the requirements as below:
- must be a generic method that can be used by different architectures,
  because the allocation of the task_ctx_data is implemented in the
  perf generic code;
- must be an alignment-guarantee method (The alignment requirement is
  not changed after the boot);
- must be able to allocate/free a buffer (smaller than a page size)
  dynamically;
- should not cause extra CPU overhead or space overhead.

Several options were considered as below:
- One option is to allocate a larger buffer for task_ctx_data. E.g.,
    ptr = kmalloc(size + alignment, GFP_KERNEL);
    ptr &= ~(alignment - 1);
  This option causes space overhead.
- Another option is to allocate the task_ctx_data in the PMU specific
  code. To do so, several function pointers have to be added. As a
  result, both the generic structure and the PMU specific structure
  will become bigger. Besides, extra function calls are added when
  allocating/freeing the buffer. This option will increase both the
  space overhead and CPU overhead.
- The third option is to use a kmem_cache to allocate a buffer for the
  task_ctx_data. The kmem_cache can be created with a specific alignment
  requirement by the PMU at boot time. A new pointer for kmem_cache has
  to be added in the generic struct pmu, which would be used to
  dynamically allocate a buffer for the task_ctx_data at run time.
  Although the new pointer is added to the struct pmu, the existing
  variable task_ctx_size is not required anymore. The size of the
  generic structure is kept the same.

The third option which meets all the aforementioned requirements is used
to replace kzalloc() for the PMU specific data allocation. A later patch
will remove the kzalloc() method and the related variables.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-17-git-send-email-kan.liang@linux.intel.com
---
 include/linux/perf_event.h | 5 +++++
 kernel/events/core.c       | 8 +++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 46fe5cfb5163..09915ae06d28 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -424,6 +424,11 @@ struct pmu {
 	 */
 	size_t				task_ctx_size;
 
+	/*
+	 * Kmem cache of PMU specific data
+	 */
+	struct kmem_cache		*task_ctx_cache;
+
 	/*
 	 * PMU specific parts of task perf event context (i.e. ctx->task_ctx_data)
 	 * can be synchronized using this function. See Intel LBR callstack support
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 75090403f942..30d9b3182369 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1240,12 +1240,18 @@ static void get_ctx(struct perf_event_context *ctx)
 
 static void *alloc_task_ctx_data(struct pmu *pmu)
 {
+	if (pmu->task_ctx_cache)
+		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
+
 	return kzalloc(pmu->task_ctx_size, GFP_KERNEL);
 }
 
 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
 {
-	kfree(task_ctx_data);
+	if (pmu->task_ctx_cache && task_ctx_data)
+		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
+	else
+		kfree(task_ctx_data);
 }
 
 static void free_ctx(struct rcu_head *head)

From 33cad284497cf40f55ad6029c06011de3538ebed Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:23 -0700
Subject: [PATCH 303/502] perf/x86/intel/lbr: Create kmem_cache for the LBR
 context data

A new kmem_cache method is introduced to allocate the PMU specific data
task_ctx_data, which requires the PMU specific code to create a
kmem_cache.

Currently, the task_ctx_data is only used by the Intel LBR call stack
feature, which is introduced since Haswell. The kmem_cache should be
only created for Haswell and later platforms. There is no alignment
requirement for the existing platforms.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-18-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/lbr.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index e4e249a78451..e784c1d485ca 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1531,9 +1531,17 @@ void __init intel_pmu_lbr_init_snb(void)
 	 */
 }
 
+static inline struct kmem_cache *
+create_lbr_kmem_cache(size_t size, size_t align)
+{
+	return kmem_cache_create("x86_lbr", size, align, 0, NULL);
+}
+
 /* haswell */
 void intel_pmu_lbr_init_hsw(void)
 {
+	size_t size = sizeof(struct x86_perf_task_context);
+
 	x86_pmu.lbr_nr	 = 16;
 	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
 	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
@@ -1542,6 +1550,8 @@ void intel_pmu_lbr_init_hsw(void)
 	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
 	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
 
+	x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+
 	if (lbr_from_signext_quirk_needed())
 		static_branch_enable(&lbr_from_quirk_key);
 }
@@ -1549,6 +1559,8 @@ void intel_pmu_lbr_init_hsw(void)
 /* skylake */
 __init void intel_pmu_lbr_init_skl(void)
 {
+	size_t size = sizeof(struct x86_perf_task_context);
+
 	x86_pmu.lbr_nr	 = 32;
 	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
 	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
@@ -1558,6 +1570,8 @@ __init void intel_pmu_lbr_init_skl(void)
 	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
 	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
 
+	x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+
 	/*
 	 * SW branch filter usage:
 	 * - support syscall, sysret capture.
@@ -1631,6 +1645,7 @@ void __init intel_pmu_arch_lbr_init(void)
 	union cpuid28_ebx ebx;
 	union cpuid28_ecx ecx;
 	unsigned int unused_edx;
+	size_t size;
 	u64 lbr_nr;
 
 	/* Arch LBR Capabilities */
@@ -1655,8 +1670,10 @@ void __init intel_pmu_arch_lbr_init(void)
 	x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
 	x86_pmu.lbr_nr = lbr_nr;
 
-	x86_get_pmu()->task_ctx_size = sizeof(struct x86_perf_task_context_arch_lbr) +
-				       lbr_nr * sizeof(struct lbr_entry);
+	size = sizeof(struct x86_perf_task_context_arch_lbr) +
+	       lbr_nr * sizeof(struct lbr_entry);
+	x86_get_pmu()->task_ctx_size = size;
+	x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
 
 	x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0;
 	x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0;

From 5a09928d339f3cf0973991ddc3a2798825c84c99 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:24 -0700
Subject: [PATCH 304/502] perf/x86: Remove task_ctx_size

A new kmem_cache method has replaced the kzalloc() to allocate the PMU
specific data. The task_ctx_size is not required anymore.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/1593780569-62993-19-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/core.c      | 1 -
 arch/x86/events/intel/lbr.c | 1 -
 include/linux/perf_event.h  | 4 ----
 kernel/events/core.c        | 4 +---
 4 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index d740c861724c..6b1228ae007d 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2371,7 +2371,6 @@ static struct pmu pmu = {
 
 	.event_idx		= x86_pmu_event_idx,
 	.sched_task		= x86_pmu_sched_task,
-	.task_ctx_size          = sizeof(struct x86_perf_task_context),
 	.swap_task_ctx		= x86_pmu_swap_task_ctx,
 	.check_period		= x86_pmu_check_period,
 
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index e784c1d485ca..3ad528996d1c 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -1672,7 +1672,6 @@ void __init intel_pmu_arch_lbr_init(void)
 
 	size = sizeof(struct x86_perf_task_context_arch_lbr) +
 	       lbr_nr * sizeof(struct lbr_entry);
-	x86_get_pmu()->task_ctx_size = size;
 	x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
 
 	x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 09915ae06d28..3b22db08b6fb 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -419,10 +419,6 @@ struct pmu {
 	 */
 	void (*sched_task)		(struct perf_event_context *ctx,
 					bool sched_in);
-	/*
-	 * PMU specific data size
-	 */
-	size_t				task_ctx_size;
 
 	/*
 	 * Kmem cache of PMU specific data
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 30d9b3182369..7c436d705fbd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1243,15 +1243,13 @@ static void *alloc_task_ctx_data(struct pmu *pmu)
 	if (pmu->task_ctx_cache)
 		return kmem_cache_zalloc(pmu->task_ctx_cache, GFP_KERNEL);
 
-	return kzalloc(pmu->task_ctx_size, GFP_KERNEL);
+	return NULL;
 }
 
 static void free_task_ctx_data(struct pmu *pmu, void *task_ctx_data)
 {
 	if (pmu->task_ctx_cache && task_ctx_data)
 		kmem_cache_free(pmu->task_ctx_cache, task_ctx_data);
-	else
-		kfree(task_ctx_data);
 }
 
 static void free_ctx(struct rcu_head *head)

From a063bf249b9f8d8004f282031781322c1b527d13 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:25 -0700
Subject: [PATCH 305/502] x86/fpu: Use proper mask to replace full instruction
 mask

When saving xstate to a kernel/user XSAVE area with the XSAVE family of
instructions, the current code applies the 'full' instruction mask (-1),
which tries to XSAVE all possible features. This method relies on
hardware to trim 'all possible' down to what is enabled in the
hardware. The code works well for now. However, there will be a
problem, if some features are enabled in hardware, but are not suitable
to be saved into all kernel XSAVE buffers, like task->fpu, due to
performance considerations.

One such example is the Last Branch Records (LBR) state. The LBR state
only contains valuable information when LBR is explicitly enabled by
the perf subsystem, and the size of an LBR state is large (808 bytes
for now). To avoid both CPU overhead and space overhead at each context
switch, the LBR state should not be saved into task->fpu like other
state components. It should be saved/restored on demand when LBR is
enabled in the perf subsystem. Current copy_xregs_to_* will trigger a
buffer overflow for such cases.

Three sites use the '-1' instruction mask which must be updated.

Two are saving/restoring the xstate to/from a kernel-allocated XSAVE
buffer and can use 'xfeatures_mask_all', which will save/restore all of
the features present in a normal task FPU buffer.

The last one saves the register state directly to a user buffer. It
could also use 'xfeatures_mask_all'. Just as it was with the '-1' argument,
any supervisor states in the mask will be filtered out by the hardware
and not saved to the buffer.  But, to be more explicit about what is
expected to be saved, use xfeatures_mask_user() for the instruction
mask.

KVM includes the header file fpu/internal.h. To avoid 'undefined
xfeatures_mask_all' compiling issue, move copy_fpregs_to_fpstate() to
fpu/core.c and export it, because:
- The xfeatures_mask_all is indirectly used via copy_fpregs_to_fpstate()
  by KVM. The function which is directly used by other modules should be
  exported.
- The copy_fpregs_to_fpstate() is a function, while xfeatures_mask_all
  is a variable for the "internal" FPU state. It's safer to export a
  function than a variable, which may be implicitly changed by others.
- The copy_fpregs_to_fpstate() is a big function with many checks. The
  removal of the inline keyword should not impact the performance.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/1593780569-62993-20-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/include/asm/fpu/internal.h | 47 +++++------------------------
 arch/x86/kernel/fpu/core.c          | 39 ++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 40 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 42159f45bf9c..d3724dc8c5d2 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -274,7 +274,7 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
  */
 static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate)
 {
-	u64 mask = -1;
+	u64 mask = xfeatures_mask_all;
 	u32 lmask = mask;
 	u32 hmask = mask >> 32;
 	int err;
@@ -320,7 +320,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate)
  */
 static inline void copy_xregs_to_kernel(struct xregs_state *xstate)
 {
-	u64 mask = -1;
+	u64 mask = xfeatures_mask_all;
 	u32 lmask = mask;
 	u32 hmask = mask >> 32;
 	int err;
@@ -356,6 +356,9 @@ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask)
  */
 static inline int copy_xregs_to_user(struct xregs_state __user *buf)
 {
+	u64 mask = xfeatures_mask_user();
+	u32 lmask = mask;
+	u32 hmask = mask >> 32;
 	int err;
 
 	/*
@@ -367,7 +370,7 @@ static inline int copy_xregs_to_user(struct xregs_state __user *buf)
 		return -EFAULT;
 
 	stac();
-	XSTATE_OP(XSAVE, buf, -1, -1, err);
+	XSTATE_OP(XSAVE, buf, lmask, hmask, err);
 	clac();
 
 	return err;
@@ -408,43 +411,7 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask)
 	return err;
 }
 
-/*
- * These must be called with preempt disabled. Returns
- * 'true' if the FPU state is still intact and we can
- * keep registers active.
- *
- * The legacy FNSAVE instruction cleared all FPU state
- * unconditionally, so registers are essentially destroyed.
- * Modern FPU state can be kept in registers, if there are
- * no pending FP exceptions.
- */
-static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
-{
-	if (likely(use_xsave())) {
-		copy_xregs_to_kernel(&fpu->state.xsave);
-
-		/*
-		 * AVX512 state is tracked here because its use is
-		 * known to slow the max clock speed of the core.
-		 */
-		if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
-			fpu->avx512_timestamp = jiffies;
-		return 1;
-	}
-
-	if (likely(use_fxsr())) {
-		copy_fxregs_to_kernel(fpu);
-		return 1;
-	}
-
-	/*
-	 * Legacy FPU register saving, FNSAVE always clears FPU registers,
-	 * so we have to mark them inactive:
-	 */
-	asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
-
-	return 0;
-}
+extern int copy_fpregs_to_fpstate(struct fpu *fpu);
 
 static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask)
 {
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 06c818967bb6..1bb7532f5f34 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -82,6 +82,45 @@ bool irq_fpu_usable(void)
 }
 EXPORT_SYMBOL(irq_fpu_usable);
 
+/*
+ * These must be called with preempt disabled. Returns
+ * 'true' if the FPU state is still intact and we can
+ * keep registers active.
+ *
+ * The legacy FNSAVE instruction cleared all FPU state
+ * unconditionally, so registers are essentially destroyed.
+ * Modern FPU state can be kept in registers, if there are
+ * no pending FP exceptions.
+ */
+int copy_fpregs_to_fpstate(struct fpu *fpu)
+{
+	if (likely(use_xsave())) {
+		copy_xregs_to_kernel(&fpu->state.xsave);
+
+		/*
+		 * AVX512 state is tracked here because its use is
+		 * known to slow the max clock speed of the core.
+		 */
+		if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512)
+			fpu->avx512_timestamp = jiffies;
+		return 1;
+	}
+
+	if (likely(use_fxsr())) {
+		copy_fxregs_to_kernel(fpu);
+		return 1;
+	}
+
+	/*
+	 * Legacy FPU register saving, FNSAVE always clears FPU registers,
+	 * so we have to mark them inactive:
+	 */
+	asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
+
+	return 0;
+}
+EXPORT_SYMBOL(copy_fpregs_to_fpstate);
+
 void kernel_fpu_begin(void)
 {
 	preempt_disable();

From f0dccc9da4c0fda049e99326f85db8c242fd781f Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:26 -0700
Subject: [PATCH 306/502] x86/fpu/xstate: Support dynamic supervisor feature
 for LBR

Last Branch Records (LBR) registers are used to log taken branches and
other control flows. In perf with call stack mode, LBR information is
used to reconstruct a call stack. To get the complete call stack, perf
has to save/restore all LBR registers during a context switch. Due to
the large number of the LBR registers, e.g., the current platform has
96 LBR registers, this process causes a high CPU overhead. To reduce
the CPU overhead during a context switch, an LBR state component that
contains all the LBR related registers is introduced in hardware. All
LBR registers can be saved/restored together using one XSAVES/XRSTORS
instruction.

However, the kernel should not save/restore the LBR state component at
each context switch, like other state components, because of the
following unique features of LBR:
- The LBR state component only contains valuable information when LBR
  is enabled in the perf subsystem, but for most of the time, LBR is
  disabled.
- The size of the LBR state component is huge. For the current
  platform, it's 808 bytes.
If the kernel saves/restores the LBR state at each context switch, for
most of the time, it is just a waste of space and cycles.

To efficiently support the LBR state component, it is desired to have:
- only context-switch the LBR when the LBR feature is enabled in perf.
- only allocate an LBR-specific XSAVE buffer on demand.
  (Besides the LBR state, a legacy region and an XSAVE header have to be
   included in the buffer as well. There is a total of (808+576) byte
   overhead for the LBR-specific XSAVE buffer. The overhead only happens
   when the perf is actively using LBRs. There is still a space-saving,
   on average, when it replaces the constant 808 bytes of overhead for
   every task, all the time on the systems that support architectural
   LBR.)
- be able to use XSAVES/XRSTORS for accessing LBR at run time.
  However, the IA32_XSS should not be adjusted at run time.
  (The XCR0 | IA32_XSS are used to determine the requested-feature
  bitmap (RFBM) of XSAVES.)

A solution, called dynamic supervisor feature, is introduced to address
this issue, which
- does not allocate a buffer in each task->fpu;
- does not save/restore a state component at each context switch;
- sets the bit corresponding to the dynamic supervisor feature in
  IA32_XSS at boot time, and avoids setting it at run time.
- dynamically allocates a specific buffer for a state component
  on demand, e.g. only allocates LBR-specific XSAVE buffer when LBR is
  enabled in perf. (Note: The buffer has to include the LBR state
  component, a legacy region and a XSAVE header space.)
  (Implemented in a later patch)
- saves/restores a state component on demand, e.g. manually invokes
  the XSAVES/XRSTORS instruction to save/restore the LBR state
  to/from the buffer when perf is active and a call stack is required.
  (Implemented in a later patch)

A new mask XFEATURE_MASK_DYNAMIC and a helper xfeatures_mask_dynamic()
are introduced to indicate the dynamic supervisor feature. For the
systems which support the Architecture LBR, LBR is the only dynamic
supervisor feature for now. For the previous systems, there is no
dynamic supervisor feature available.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/1593780569-62993-21-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/include/asm/fpu/types.h  |  7 +++++++
 arch/x86/include/asm/fpu/xstate.h | 30 ++++++++++++++++++++++++++++++
 arch/x86/kernel/fpu/xstate.c      | 15 ++++++++++-----
 3 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index f098f6cab94b..132e9cc26d60 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -114,6 +114,12 @@ enum xfeature {
 	XFEATURE_Hi16_ZMM,
 	XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
 	XFEATURE_PKRU,
+	XFEATURE_RSRVD_COMP_10,
+	XFEATURE_RSRVD_COMP_11,
+	XFEATURE_RSRVD_COMP_12,
+	XFEATURE_RSRVD_COMP_13,
+	XFEATURE_RSRVD_COMP_14,
+	XFEATURE_LBR,
 
 	XFEATURE_MAX,
 };
@@ -128,6 +134,7 @@ enum xfeature {
 #define XFEATURE_MASK_Hi16_ZMM		(1 << XFEATURE_Hi16_ZMM)
 #define XFEATURE_MASK_PT		(1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
 #define XFEATURE_MASK_PKRU		(1 << XFEATURE_PKRU)
+#define XFEATURE_MASK_LBR		(1 << XFEATURE_LBR)
 
 #define XFEATURE_MASK_FPSSE		(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
 #define XFEATURE_MASK_AVX512		(XFEATURE_MASK_OPMASK \
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 422d8369012a..040c4d49bfcb 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -35,6 +35,27 @@
 /* All currently supported supervisor features */
 #define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0)
 
+/*
+ * A supervisor state component may not always contain valuable information,
+ * and its size may be huge. Saving/restoring such supervisor state components
+ * at each context switch can cause high CPU and space overhead, which should
+ * be avoided. Such supervisor state components should only be saved/restored
+ * on demand. The on-demand dynamic supervisor features are set in this mask.
+ *
+ * Unlike the existing supported supervisor features, a dynamic supervisor
+ * feature does not allocate a buffer in task->fpu, and the corresponding
+ * supervisor state component cannot be saved/restored at each context switch.
+ *
+ * To support a dynamic supervisor feature, a developer should follow the
+ * dos and don'ts as below:
+ * - Do dynamically allocate a buffer for the supervisor state component.
+ * - Do manually invoke the XSAVES/XRSTORS instruction to save/restore the
+ *   state component to/from the buffer.
+ * - Don't set the bit corresponding to the dynamic supervisor feature in
+ *   IA32_XSS at run time, since it has been set at boot time.
+ */
+#define XFEATURE_MASK_DYNAMIC (XFEATURE_MASK_LBR)
+
 /*
  * Unsupported supervisor features. When a supervisor feature in this mask is
  * supported in the future, move it to the supported supervisor feature mask.
@@ -43,6 +64,7 @@
 
 /* All supervisor states including supported and unsupported states. */
 #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
+				      XFEATURE_MASK_DYNAMIC | \
 				      XFEATURE_MASK_SUPERVISOR_UNSUPPORTED)
 
 #ifdef CONFIG_X86_64
@@ -63,6 +85,14 @@ static inline u64 xfeatures_mask_user(void)
 	return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED;
 }
 
+static inline u64 xfeatures_mask_dynamic(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_ARCH_LBR))
+		return XFEATURE_MASK_DYNAMIC & ~XFEATURE_MASK_LBR;
+
+	return XFEATURE_MASK_DYNAMIC;
+}
+
 extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
 
 extern void __init update_regset_xstate_info(unsigned int size,
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index bda2e5eaca0e..dcf062442b18 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -233,8 +233,10 @@ void fpu__init_cpu_xstate(void)
 	/*
 	 * MSR_IA32_XSS sets supervisor states managed by XSAVES.
 	 */
-	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
+	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() |
+				     xfeatures_mask_dynamic());
+	}
 }
 
 static bool xfeature_enabled(enum xfeature xfeature)
@@ -598,7 +600,8 @@ static void check_xstate_against_struct(int nr)
 	 */
 	if ((nr < XFEATURE_YMM) ||
 	    (nr >= XFEATURE_MAX) ||
-	    (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR)) {
+	    (nr == XFEATURE_PT_UNIMPLEMENTED_SO_FAR) ||
+	    ((nr >= XFEATURE_RSRVD_COMP_10) && (nr <= XFEATURE_LBR))) {
 		WARN_ONCE(1, "no structure for xstate: %d\n", nr);
 		XSTATE_WARN_ON(1);
 	}
@@ -847,8 +850,10 @@ void fpu__resume_cpu(void)
 	 * Restore IA32_XSS. The same CPUID bit enumerates support
 	 * of XSAVES and MSR_IA32_XSS.
 	 */
-	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
+	if (boot_cpu_has(X86_FEATURE_XSAVES)) {
+		wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()  |
+				     xfeatures_mask_dynamic());
+	}
 }
 
 /*

From 50f408d96d4d1a945d2c50c5fd8ed400883edf0e Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:27 -0700
Subject: [PATCH 307/502] x86/fpu/xstate: Add helpers for LBR dynamic
 supervisor feature

The perf subsystem will only need to save/restore the LBR state.
However, the existing helpers save all supported supervisor states to a
kernel buffer, which will be unnecessary. Two helpers are introduced to
only save/restore requested dynamic supervisor states. The supervisor
features in XFEATURE_MASK_SUPERVISOR_SUPPORTED and
XFEATURE_MASK_SUPERVISOR_UNSUPPORTED mask cannot be saved/restored using
these helpers.

The helpers will be used in the following patch.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/1593780569-62993-22-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/include/asm/fpu/xstate.h |  3 ++
 arch/x86/kernel/fpu/xstate.c      | 72 +++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 040c4d49bfcb..c029fce627cf 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -106,6 +106,9 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
 int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
 int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf);
 void copy_supervisor_to_kernel(struct xregs_state *xsave);
+void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask);
+void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask);
+
 
 /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
 int validate_user_xstate_header(const struct xstate_header *hdr);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index dcf062442b18..b0c22b7dae0a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -1361,6 +1361,78 @@ void copy_supervisor_to_kernel(struct xregs_state *xstate)
 	}
 }
 
+/**
+ * copy_dynamic_supervisor_to_kernel() - Save dynamic supervisor states to
+ *                                       an xsave area
+ * @xstate: A pointer to an xsave area
+ * @mask: Represent the dynamic supervisor features saved into the xsave area
+ *
+ * Only the dynamic supervisor states sets in the mask are saved into the xsave
+ * area (See the comment in XFEATURE_MASK_DYNAMIC for the details of dynamic
+ * supervisor feature). Besides the dynamic supervisor states, the legacy
+ * region and XSAVE header are also saved into the xsave area. The supervisor
+ * features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
+ * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not saved.
+ *
+ * The xsave area must be 64-bytes aligned.
+ */
+void copy_dynamic_supervisor_to_kernel(struct xregs_state *xstate, u64 mask)
+{
+	u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
+	u32 lmask, hmask;
+	int err;
+
+	if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
+		return;
+
+	if (WARN_ON_FPU(!dynamic_mask))
+		return;
+
+	lmask = dynamic_mask;
+	hmask = dynamic_mask >> 32;
+
+	XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
+
+	/* Should never fault when copying to a kernel buffer */
+	WARN_ON_FPU(err);
+}
+
+/**
+ * copy_kernel_to_dynamic_supervisor() - Restore dynamic supervisor states from
+ *                                       an xsave area
+ * @xstate: A pointer to an xsave area
+ * @mask: Represent the dynamic supervisor features restored from the xsave area
+ *
+ * Only the dynamic supervisor states sets in the mask are restored from the
+ * xsave area (See the comment in XFEATURE_MASK_DYNAMIC for the details of
+ * dynamic supervisor feature). Besides the dynamic supervisor states, the
+ * legacy region and XSAVE header are also restored from the xsave area. The
+ * supervisor features in the XFEATURE_MASK_SUPERVISOR_SUPPORTED and
+ * XFEATURE_MASK_SUPERVISOR_UNSUPPORTED are not restored.
+ *
+ * The xsave area must be 64-bytes aligned.
+ */
+void copy_kernel_to_dynamic_supervisor(struct xregs_state *xstate, u64 mask)
+{
+	u64 dynamic_mask = xfeatures_mask_dynamic() & mask;
+	u32 lmask, hmask;
+	int err;
+
+	if (WARN_ON_FPU(!boot_cpu_has(X86_FEATURE_XSAVES)))
+		return;
+
+	if (WARN_ON_FPU(!dynamic_mask))
+		return;
+
+	lmask = dynamic_mask;
+	hmask = dynamic_mask >> 32;
+
+	XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
+
+	/* Should never fault when copying from a kernel buffer */
+	WARN_ON_FPU(err);
+}
+
 #ifdef CONFIG_PROC_PID_ARCH_STATUS
 /*
  * Report the amount of time elapsed in millisecond since last AVX512

From ce711ea3cab9ad325d849792d442848e553095b8 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:28 -0700
Subject: [PATCH 308/502] perf/x86/intel/lbr: Support XSAVES/XRSTORS for LBR
 context switch

In the LBR call stack mode, LBR information is used to reconstruct a
call stack. To get the complete call stack, perf has to save/restore
all LBR registers during a context switch. Due to a large number of the
LBR registers, this process causes a high CPU overhead. To reduce the
CPU overhead during a context switch, use the XSAVES/XRSTORS
instructions.

Every XSAVE area must follow a canonical format: the legacy region, an
XSAVE header and the extended region. Although the LBR information is
only kept in the extended region, a space for the legacy region and
XSAVE header is still required. Add a new dedicated structure for LBR
XSAVES support.

Before enabling XSAVES support, the size of the LBR state has to be
sanity checked, because:
- the size of the software structure is calculated from the max number
of the LBR depth, which is enumerated by the CPUID leaf for Arch LBR.
The size of the LBR state is enumerated by the CPUID leaf for XSAVE
support of Arch LBR. If the values from the two CPUID leaves are not
consistent, it may trigger a buffer overflow. For example, a hypervisor
may inadvertently set inconsistent values for the two emulated CPUID leaves.
- unlike other state components, the size of an LBR state depends on the
max number of LBRs, which may vary from generation to generation.

Expose the function xfeature_size() for the sanity check.
The LBR XSAVES support will be disabled if the size of the LBR state
enumerated by CPUID doesn't match with the size of the software
structure.

The XSAVE instruction requires 64-byte alignment for state buffers. A
new macro is added to reflect the alignment requirement. A 64-byte
aligned kmem_cache is created for architecture LBR.

Currently, the structure for each state component is maintained in
fpu/types.h. The structure for the new LBR state component should be
maintained in the same place. Move structure lbr_entry to fpu/types.h as
well for broader sharing.

Add dedicated lbr_save/lbr_restore functions for LBR XSAVES support,
which invokes the corresponding xstate helpers to XSAVES/XRSTORS LBR
information at the context switch when the call stack mode is enabled.
Since the XSAVES/XRSTORS instructions will eventually be invoked, the
dedicated functions are named with the '_xsaves'/'_xrstors' suffix.

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/1593780569-62993-23-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/intel/lbr.c       | 79 +++++++++++++++++++++++++++++--
 arch/x86/events/perf_event.h      | 21 ++++++++
 arch/x86/include/asm/fpu/types.h  | 20 ++++++++
 arch/x86/include/asm/fpu/xstate.h |  3 ++
 arch/x86/include/asm/perf_event.h |  4 --
 arch/x86/kernel/fpu/xstate.c      |  2 +-
 6 files changed, 119 insertions(+), 10 deletions(-)

diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 3ad528996d1c..cb1a0495339b 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -483,6 +483,17 @@ static void intel_pmu_arch_lbr_restore(void *ctx)
 	}
 }
 
+/*
+ * Restore the Architecture LBR state from the xsave area in the perf
+ * context data for the task via the XRSTORS instruction.
+ */
+static void intel_pmu_arch_lbr_xrstors(void *ctx)
+{
+	struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
+
+	copy_kernel_to_dynamic_supervisor(&task_ctx->xsave, XFEATURE_MASK_LBR);
+}
+
 static __always_inline bool lbr_is_reset_in_cstate(void *ctx)
 {
 	if (static_cpu_has(X86_FEATURE_ARCH_LBR))
@@ -557,6 +568,17 @@ static void intel_pmu_arch_lbr_save(void *ctx)
 		entries[x86_pmu.lbr_nr - 1].from = 0;
 }
 
+/*
+ * Save the Architecture LBR state to the xsave area in the perf
+ * context data for the task via the XSAVES instruction.
+ */
+static void intel_pmu_arch_lbr_xsaves(void *ctx)
+{
+	struct x86_perf_task_context_arch_lbr_xsave *task_ctx = ctx;
+
+	copy_dynamic_supervisor_to_kernel(&task_ctx->xsave, XFEATURE_MASK_LBR);
+}
+
 static void __intel_pmu_lbr_save(void *ctx)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1639,12 +1661,40 @@ void intel_pmu_lbr_init_knl(void)
 		x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
 }
 
+/*
+ * LBR state size is variable based on the max number of registers.
+ * This calculates the expected state size, which should match
+ * what the hardware enumerates for the size of XFEATURE_LBR.
+ */
+static inline unsigned int get_lbr_state_size(void)
+{
+	return sizeof(struct arch_lbr_state) +
+	       x86_pmu.lbr_nr * sizeof(struct lbr_entry);
+}
+
+static bool is_arch_lbr_xsave_available(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_XSAVES))
+		return false;
+
+	/*
+	 * Check the LBR state with the corresponding software structure.
+	 * Disable LBR XSAVES support if the size doesn't match.
+	 */
+	if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size()))
+		return false;
+
+	return true;
+}
+
 void __init intel_pmu_arch_lbr_init(void)
 {
+	struct pmu *pmu = x86_get_pmu();
 	union cpuid28_eax eax;
 	union cpuid28_ebx ebx;
 	union cpuid28_ecx ecx;
 	unsigned int unused_edx;
+	bool arch_lbr_xsave;
 	size_t size;
 	u64 lbr_nr;
 
@@ -1670,9 +1720,22 @@ void __init intel_pmu_arch_lbr_init(void)
 	x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
 	x86_pmu.lbr_nr = lbr_nr;
 
-	size = sizeof(struct x86_perf_task_context_arch_lbr) +
-	       lbr_nr * sizeof(struct lbr_entry);
-	x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+
+	arch_lbr_xsave = is_arch_lbr_xsave_available();
+	if (arch_lbr_xsave) {
+		size = sizeof(struct x86_perf_task_context_arch_lbr_xsave) +
+		       get_lbr_state_size();
+		pmu->task_ctx_cache = create_lbr_kmem_cache(size,
+							    XSAVE_ALIGNMENT);
+	}
+
+	if (!pmu->task_ctx_cache) {
+		arch_lbr_xsave = false;
+
+		size = sizeof(struct x86_perf_task_context_arch_lbr) +
+		       lbr_nr * sizeof(struct lbr_entry);
+		pmu->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+	}
 
 	x86_pmu.lbr_from = MSR_ARCH_LBR_FROM_0;
 	x86_pmu.lbr_to = MSR_ARCH_LBR_TO_0;
@@ -1705,8 +1768,14 @@ void __init intel_pmu_arch_lbr_init(void)
 
 	x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset;
 	x86_pmu.lbr_read = intel_pmu_arch_lbr_read;
-	x86_pmu.lbr_save = intel_pmu_arch_lbr_save;
-	x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore;
+	if (arch_lbr_xsave) {
+		x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves;
+		x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors;
+		pr_cont("XSAVE ");
+	} else {
+		x86_pmu.lbr_save = intel_pmu_arch_lbr_save;
+		x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore;
+	}
 
 	pr_cont("Architectural LBR, ");
 
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 3f7c329374bb..d5e351c1f3c1 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -777,6 +777,27 @@ struct x86_perf_task_context_arch_lbr {
 	struct lbr_entry entries[];
 };
 
+/*
+ * Add padding to guarantee the 64-byte alignment of the state buffer.
+ *
+ * The structure is dynamically allocated. The size of the LBR state may vary
+ * based on the number of LBR registers.
+ *
+ * Do not put anything after the LBR state.
+ */
+struct x86_perf_task_context_arch_lbr_xsave {
+	struct x86_perf_task_context_opt		opt;
+
+	union {
+		struct xregs_state			xsave;
+		struct {
+			struct fxregs_state		i387;
+			struct xstate_header		header;
+			struct arch_lbr_state		lbr;
+		} __attribute__ ((packed, aligned (XSAVE_ALIGNMENT)));
+	};
+};
+
 #define x86_add_quirk(func_)						\
 do {									\
 	static struct x86_pmu_quirk __quirk __initdata = {		\
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 132e9cc26d60..c87364ea6446 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -236,6 +236,26 @@ struct pkru_state {
 	u32				pad;
 } __packed;
 
+/*
+ * State component 15: Architectural LBR configuration state.
+ * The size of Arch LBR state depends on the number of LBRs (lbr_depth).
+ */
+
+struct lbr_entry {
+	u64 from;
+	u64 to;
+	u64 info;
+};
+
+struct arch_lbr_state {
+	u64 lbr_ctl;
+	u64 lbr_depth;
+	u64 ler_from;
+	u64 ler_to;
+	u64 ler_info;
+	struct lbr_entry		entries[];
+} __packed;
+
 struct xstate_header {
 	u64				xfeatures;
 	u64				xcomp_bv;
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index c029fce627cf..1559554af931 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -21,6 +21,8 @@
 #define XSAVE_YMM_SIZE	    256
 #define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
 
+#define XSAVE_ALIGNMENT     64
+
 /* All currently supported user features */
 #define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \
 				      XFEATURE_MASK_SSE | \
@@ -101,6 +103,7 @@ extern void __init update_regset_xstate_info(unsigned int size,
 void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr);
 const void *get_xsave_field_ptr(int xfeature_nr);
 int using_compacted_format(void);
+int xfeature_size(int xfeature_nr);
 int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
 int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size);
 int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf);
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 2e29558c9c6b..0c1b13720525 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -282,10 +282,6 @@ struct pebs_xmm {
 	u64 xmm[16*2];	/* two entries for each register */
 };
 
-struct lbr_entry {
-	u64 from, to, info;
-};
-
 /*
  * IBS cpuid feature detection
  */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index b0c22b7dae0a..10cf8789c378 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -488,7 +488,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
 	return ebx;
 }
 
-static int xfeature_size(int xfeature_nr)
+int xfeature_size(int xfeature_nr)
 {
 	u32 eax, ebx, ecx, edx;
 

From c085fb8774671e83f6199a8e838fbc0e57094029 Mon Sep 17 00:00:00 2001
From: Kan Liang <kan.liang@linux.intel.com>
Date: Fri, 3 Jul 2020 05:49:29 -0700
Subject: [PATCH 309/502] perf/x86/intel/lbr: Support XSAVES for arch LBR read

Reading LBR registers in a perf NMI handler for a non-PEBS event
causes a high overhead because the number of LBR registers is huge.
To reduce the overhead, the XSAVES instruction should be used to replace
the LBR registers' reading method.

The XSAVES buffer used for LBR read has to be per-CPU because the NMI
handler invokes lbr_read(). The existing task_ctx_data buffer cannot be
used because it is per-task and is only allocated for the LBR call
stack mode. A new lbr_xsave pointer is introduced in cpu_hw_events as
an XSAVES buffer for LBR read.

The XSAVES buffer should be allocated only when LBR is used by a
non-PEBS event on the CPU because the total size of the lbr_xsave is
not small (~1.4KB).

The XSAVES buffer is allocated when a non-PEBS event is added, but it
is lazily released in x86_release_hardware() when perf releases the
entire PMU hardware resource, because perf may frequently schedule the
event, e.g. under a high context-switch rate. The lazy release method
reduces the overhead of frequently allocating/freeing the buffer.

If lbr_xsave fails to be allocated, fall back to the normal Arch LBR
lbr_read().

Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Link: https://lkml.kernel.org/r/1593780569-62993-24-git-send-email-kan.liang@linux.intel.com
---
 arch/x86/events/core.c       |  1 +
 arch/x86/events/intel/lbr.c  | 40 +++++++++++++++++++++++++++++++++++-
 arch/x86/events/perf_event.h |  7 +++++++
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 6b1228ae007d..1cbf57dc2ac8 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -358,6 +358,7 @@ void x86_release_hardware(void)
 	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
 		release_pmc_hardware();
 		release_ds_buffers();
+		release_lbr_buffers();
 		mutex_unlock(&pmc_reserve_mutex);
 	}
 }
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index cb1a0495339b..63f58bdf556c 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -658,6 +658,7 @@ static inline bool branch_user_callstack(unsigned br_sel)
 
 void intel_pmu_lbr_add(struct perf_event *event)
 {
+	struct kmem_cache *kmem_cache = event->pmu->task_ctx_cache;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
 	if (!x86_pmu.lbr_nr)
@@ -695,6 +696,29 @@ void intel_pmu_lbr_add(struct perf_event *event)
 	perf_sched_cb_inc(event->ctx->pmu);
 	if (!cpuc->lbr_users++ && !event->total_time_running)
 		intel_pmu_lbr_reset();
+
+	if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
+	    kmem_cache && !cpuc->lbr_xsave &&
+	    (cpuc->lbr_users != cpuc->lbr_pebs_users))
+		cpuc->lbr_xsave = kmem_cache_alloc(kmem_cache, GFP_KERNEL);
+}
+
+void release_lbr_buffers(void)
+{
+	struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache;
+	struct cpu_hw_events *cpuc;
+	int cpu;
+
+	if (!static_cpu_has(X86_FEATURE_ARCH_LBR))
+		return;
+
+	for_each_possible_cpu(cpu) {
+		cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+		if (kmem_cache && cpuc->lbr_xsave) {
+			kmem_cache_free(kmem_cache, cpuc->lbr_xsave);
+			cpuc->lbr_xsave = NULL;
+		}
+	}
 }
 
 void intel_pmu_lbr_del(struct perf_event *event)
@@ -945,6 +969,19 @@ static void intel_pmu_arch_lbr_read(struct cpu_hw_events *cpuc)
 	intel_pmu_store_lbr(cpuc, NULL);
 }
 
+static void intel_pmu_arch_lbr_read_xsave(struct cpu_hw_events *cpuc)
+{
+	struct x86_perf_task_context_arch_lbr_xsave *xsave = cpuc->lbr_xsave;
+
+	if (!xsave) {
+		intel_pmu_store_lbr(cpuc, NULL);
+		return;
+	}
+	copy_dynamic_supervisor_to_kernel(&xsave->xsave, XFEATURE_MASK_LBR);
+
+	intel_pmu_store_lbr(cpuc, xsave->lbr.entries);
+}
+
 void intel_pmu_lbr_read(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1767,14 +1804,15 @@ void __init intel_pmu_arch_lbr_init(void)
 		x86_pmu.lbr_ctl_map = NULL;
 
 	x86_pmu.lbr_reset = intel_pmu_arch_lbr_reset;
-	x86_pmu.lbr_read = intel_pmu_arch_lbr_read;
 	if (arch_lbr_xsave) {
 		x86_pmu.lbr_save = intel_pmu_arch_lbr_xsaves;
 		x86_pmu.lbr_restore = intel_pmu_arch_lbr_xrstors;
+		x86_pmu.lbr_read = intel_pmu_arch_lbr_read_xsave;
 		pr_cont("XSAVE ");
 	} else {
 		x86_pmu.lbr_save = intel_pmu_arch_lbr_save;
 		x86_pmu.lbr_restore = intel_pmu_arch_lbr_restore;
+		x86_pmu.lbr_read = intel_pmu_arch_lbr_read;
 	}
 
 	pr_cont("Architectural LBR, ");
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index d5e351c1f3c1..7b68ab5f19e7 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -253,6 +253,7 @@ struct cpu_hw_events {
 	void				*last_task_ctx;
 	int				last_log_id;
 	int				lbr_select;
+	void				*lbr_xsave;
 
 	/*
 	 * Intel host/guest exclude bits
@@ -1066,6 +1067,8 @@ void release_ds_buffers(void);
 
 void reserve_ds_buffers(void);
 
+void release_lbr_buffers(void);
+
 extern struct event_constraint bts_constraint;
 extern struct event_constraint vlbr_constraint;
 
@@ -1207,6 +1210,10 @@ static inline void release_ds_buffers(void)
 {
 }
 
+static inline void release_lbr_buffers(void)
+{
+}
+
 static inline int intel_pmu_init(void)
 {
 	return 0;

From aa340845ae6f019e0a12321a1741c14679bb0664 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 8 Jul 2020 21:47:11 +0300
Subject: [PATCH 310/502] io_uring: fix a use after free in
 io_async_task_func()

The "apoll" variable is freed and then used on the next line.  We need
to move the free down a few lines.

Fixes: 0be0b0e33b0b ("io_uring: simplify io_async_task_func()")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4c9a494c9f9f..14168fbc7d79 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4655,12 +4655,13 @@ static void io_async_task_func(struct callback_head *cb)
 	/* restore ->work in case we need to retry again */
 	if (req->flags & REQ_F_WORK_INITIALIZED)
 		memcpy(&req->work, &apoll->work, sizeof(req->work));
-	kfree(apoll);
 
 	if (!READ_ONCE(apoll->poll.canceled))
 		__io_req_task_submit(req);
 	else
 		__io_req_task_cancel(req, -ECANCELED);
+
+	kfree(apoll);
 }
 
 static int io_async_wake(struct wait_queue_entry *wait, unsigned mode, int sync,

From 5acbbc8ed3a9aef71c6eb5f19ba24f7321200220 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Wed, 8 Jul 2020 15:15:26 -0600
Subject: [PATCH 311/502] io_uring: only call kfree() for a non-zero pointer

It's safe to call kfree() with a NULL pointer, but it's also pointless.
Most of the time we don't have any data to free, and at millions of
requests per second, the redundant function call adds noticeable
overhead (about 1.3% of the runtime).

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 14168fbc7d79..51ff88330f9a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1519,7 +1519,8 @@ static void io_dismantle_req(struct io_kiocb *req)
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		io_cleanup_req(req);
 
-	kfree(req->io);
+	if (req->io)
+		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	__io_put_req_task(req);

From 2bc9930e78fe0cb3e7b7e3169de0a40baee38d29 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 9 Jul 2020 09:43:27 -0600
Subject: [PATCH 312/502] io_uring: get rid of __req_need_defer()

We have only one caller of this, req_need_defer(), so just inline the
code there instead.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 51ff88330f9a..7f2a2cb5c056 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1069,18 +1069,14 @@ err:
 	return NULL;
 }
 
-static inline bool __req_need_defer(struct io_kiocb *req)
-{
-	struct io_ring_ctx *ctx = req->ctx;
-
-	return req->sequence != ctx->cached_cq_tail
-				+ atomic_read(&ctx->cached_cq_overflow);
-}
-
 static inline bool req_need_defer(struct io_kiocb *req)
 {
-	if (unlikely(req->flags & REQ_F_IO_DRAIN))
-		return __req_need_defer(req);
+	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
+		struct io_ring_ctx *ctx = req->ctx;
+
+		return req->sequence != ctx->cached_cq_tail
+					+ atomic_read(&ctx->cached_cq_overflow);
+	}
 
 	return false;
 }

From 4349f30ecb8068d146a1e57bb12f46e745323b4c Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 9 Jul 2020 15:07:01 -0600
Subject: [PATCH 313/502] io_uring: remove dead 'ctx' argument and move forward
 declaration

We don't use 'ctx' at all in io_sq_thread_drop_mm(), it just works
on the mm of the current task. Drop the argument.

Move io_file_put_work() to where we have the other forward declarations
of functions.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 7f2a2cb5c056..3ce02a1613cc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -902,6 +902,7 @@ static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 static void __io_queue_sqe(struct io_kiocb *req,
 			   const struct io_uring_sqe *sqe,
 			   struct io_comp_state *cs);
+static void io_file_put_work(struct work_struct *work);
 
 static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 			       struct iovec **iovec, struct iov_iter *iter,
@@ -942,7 +943,7 @@ static void __io_put_req_task(struct io_kiocb *req)
 		put_task_struct(req->task);
 }
 
-static void io_sq_thread_drop_mm(struct io_ring_ctx *ctx)
+static void io_sq_thread_drop_mm(void)
 {
 	struct mm_struct *mm = current->mm;
 
@@ -977,8 +978,6 @@ static inline void req_set_fail_links(struct io_kiocb *req)
 		req->flags |= REQ_F_FAIL_LINK;
 }
 
-static void io_file_put_work(struct work_struct *work);
-
 /*
  * Note: must call io_req_init_async() for the first time you
  * touch any members of io_wq_work.
@@ -6339,7 +6338,7 @@ static int io_sq_thread(void *data)
 			 * adding ourselves to the waitqueue, as the unuse/drop
 			 * may sleep.
 			 */
-			io_sq_thread_drop_mm(ctx);
+			io_sq_thread_drop_mm();
 
 			/*
 			 * We're polling. If we're within the defined idle
@@ -6410,7 +6409,7 @@ static int io_sq_thread(void *data)
 
 	io_run_task_work();
 
-	io_sq_thread_drop_mm(ctx);
+	io_sq_thread_drop_mm();
 	revert_creds(old_cred);
 
 	kthread_parkme();

From 248591f5d257a19c1cba9ab9da3536bfbc2f0149 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 24 Jun 2020 13:32:46 +0200
Subject: [PATCH 314/502] kcsan: Make KCSAN compatible with new IRQ state
 tracking

The new IRQ state tracking code does not honor lockdep_off(), and as
such we should again permit tracing by using non-raw functions in
core.c. Update the lockdep_off() comment in report.c, to reflect the
fact there is still a potential risk of deadlock due to using printk()
from scheduler code.

Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200624113246.GA170324@elver.google.com
---
 kernel/kcsan/core.c   | 5 ++---
 kernel/kcsan/report.c | 9 +++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 15f67949d11e..732623c30359 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -397,8 +397,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
 	}
 
 	if (!kcsan_interrupt_watcher)
-		/* Use raw to avoid lockdep recursion via IRQ flags tracing. */
-		raw_local_irq_save(irq_flags);
+		local_irq_save(irq_flags);
 
 	watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
 	if (watchpoint == NULL) {
@@ -539,7 +538,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
 	kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS);
 out_unlock:
 	if (!kcsan_interrupt_watcher)
-		raw_local_irq_restore(irq_flags);
+		local_irq_restore(irq_flags);
 out:
 	user_access_restore(ua_flags);
 }
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index ac5f8345bae9..6b2fb1a6d8cd 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -606,10 +606,11 @@ void kcsan_report(const volatile void *ptr, size_t size, int access_type,
 		goto out;
 
 	/*
-	 * With TRACE_IRQFLAGS, lockdep's IRQ trace state becomes corrupted if
-	 * we do not turn off lockdep here; this could happen due to recursion
-	 * into lockdep via KCSAN if we detect a race in utilities used by
-	 * lockdep.
+	 * Because we may generate reports when we're in scheduler code, the use
+	 * of printk() could deadlock. Until such time that all printing code
+	 * called in print_report() is scheduler-safe, accept the risk, and just
+	 * get our message out. As such, also disable lockdep to hide the
+	 * warning, and avoid disabling lockdep for the rest of the kernel.
 	 */
 	lockdep_off();
 

From 48017e5481ce85ba52c4cff082cad5f021c4b413 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 29 May 2020 22:40:58 +0200
Subject: [PATCH 315/502] sparc64: Fix asm/percpu.h build error

In order to break a header dependency between lockdep and task_struct,
I need per-cpu stuff from lockdep.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Link: https://lkml.kernel.org/r/20200623083721.277992771@infradead.org
---
 arch/sparc/include/asm/percpu_64.h  | 2 ++
 arch/sparc/include/asm/trap_block.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/arch/sparc/include/asm/percpu_64.h b/arch/sparc/include/asm/percpu_64.h
index 32ef6f05cc56..a8786a4b90b6 100644
--- a/arch/sparc/include/asm/percpu_64.h
+++ b/arch/sparc/include/asm/percpu_64.h
@@ -4,7 +4,9 @@
 
 #include <linux/compiler.h>
 
+#ifndef BUILD_VDSO
 register unsigned long __local_per_cpu_offset asm("g5");
+#endif
 
 #ifdef CONFIG_SMP
 
diff --git a/arch/sparc/include/asm/trap_block.h b/arch/sparc/include/asm/trap_block.h
index 0f6d0c4f6683..ace0d48e837e 100644
--- a/arch/sparc/include/asm/trap_block.h
+++ b/arch/sparc/include/asm/trap_block.h
@@ -2,6 +2,8 @@
 #ifndef _SPARC_TRAP_BLOCK_H
 #define _SPARC_TRAP_BLOCK_H
 
+#include <linux/threads.h>
+
 #include <asm/hypervisor.h>
 #include <asm/asi.h>
 

From 859d069ee1ddd87862e1d6a356a82ed417dbeb67 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2020 15:00:57 +0200
Subject: [PATCH 316/502] lockdep: Prepare for NMI IRQ state tracking

There is no reason not to always, accurately, track IRQ state.

This change also makes IRQ state tracking ignore lockdep_off().

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200623083721.155449112@infradead.org
---
 kernel/locking/lockdep.c | 46 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 29a8de4c50b9..d595623c4b34 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -395,7 +395,7 @@ void lockdep_init_task(struct task_struct *task)
 
 static __always_inline void lockdep_recursion_finish(void)
 {
-	if (WARN_ON_ONCE(--current->lockdep_recursion))
+	if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK))
 		current->lockdep_recursion = 0;
 }
 
@@ -3646,7 +3646,16 @@ static void __trace_hardirqs_on_caller(void)
  */
 void lockdep_hardirqs_on_prepare(unsigned long ip)
 {
-	if (unlikely(!debug_locks || current->lockdep_recursion))
+	if (unlikely(!debug_locks))
+		return;
+
+	/*
+	 * NMIs do not (and cannot) track lock dependencies, nothing to do.
+	 */
+	if (unlikely(in_nmi()))
+		return;
+
+	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
 	if (unlikely(current->hardirqs_enabled)) {
@@ -3692,7 +3701,27 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 {
 	struct task_struct *curr = current;
 
-	if (unlikely(!debug_locks || curr->lockdep_recursion))
+	if (unlikely(!debug_locks))
+		return;
+
+	/*
+	 * NMIs can happen in the middle of local_irq_{en,dis}able() where the
+	 * tracking state and hardware state are out of sync.
+	 *
+	 * NMIs must save lockdep_hardirqs_enabled() to restore IRQ state from,
+	 * and not rely on hardware state like normal interrupts.
+	 */
+	if (unlikely(in_nmi())) {
+		/*
+		 * Skip:
+		 *  - recursion check, because NMI can hit lockdep;
+		 *  - hardware state check, because above;
+		 *  - chain_key check, see lockdep_hardirqs_on_prepare().
+		 */
+		goto skip_checks;
+	}
+
+	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
 	if (curr->hardirqs_enabled) {
@@ -3720,6 +3749,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 	DEBUG_LOCKS_WARN_ON(current->hardirq_chain_key !=
 			    current->curr_chain_key);
 
+skip_checks:
 	/* we'll do an OFF -> ON transition: */
 	curr->hardirqs_enabled = 1;
 	curr->hardirq_enable_ip = ip;
@@ -3735,7 +3765,15 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 {
 	struct task_struct *curr = current;
 
-	if (unlikely(!debug_locks || curr->lockdep_recursion))
+	if (unlikely(!debug_locks))
+		return;
+
+	/*
+	 * Matching lockdep_hardirqs_on(), allow NMIs in the middle of lockdep;
+	 * they will restore the software state. This ensures the software
+	 * state is consistent inside NMIs as well.
+	 */
+	if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)))
 		return;
 
 	/*

From d6bdceb6c2276276c0392b926ccd2e5991d5cb9a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 29 May 2020 22:41:01 +0200
Subject: [PATCH 317/502] powerpc64: Break asm/percpu.h vs spinlock_types.h
 dependency

In order to use <asm/percpu.h> in lockdep.h, we need to make sure
asm/percpu.h does not itself depend on lockdep.

The below seems to make that so and builds powerpc64-defconfig +
PROVE_LOCKING.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200623083721.336906073@infradead.org
---
 arch/powerpc/include/asm/dtl.h         | 52 ++++++++++++++++++++++++++
 arch/powerpc/include/asm/lppaca.h      | 44 ----------------------
 arch/powerpc/include/asm/paca.h        |  2 +-
 arch/powerpc/kernel/time.c             |  2 +
 arch/powerpc/kvm/book3s_hv.c           |  1 +
 arch/powerpc/platforms/pseries/dtl.c   |  1 +
 arch/powerpc/platforms/pseries/lpar.c  |  1 +
 arch/powerpc/platforms/pseries/setup.c |  1 +
 arch/powerpc/platforms/pseries/svm.c   |  1 +
 9 files changed, 60 insertions(+), 45 deletions(-)
 create mode 100644 arch/powerpc/include/asm/dtl.h

diff --git a/arch/powerpc/include/asm/dtl.h b/arch/powerpc/include/asm/dtl.h
new file mode 100644
index 000000000000..1625888f27ef
--- /dev/null
+++ b/arch/powerpc/include/asm/dtl.h
@@ -0,0 +1,52 @@
+#ifndef _ASM_POWERPC_DTL_H
+#define _ASM_POWERPC_DTL_H
+
+#include <asm/lppaca.h>
+#include <linux/spinlock_types.h>
+
+/*
+ * Layout of entries in the hypervisor's dispatch trace log buffer.
+ */
+struct dtl_entry {
+	u8	dispatch_reason;
+	u8	preempt_reason;
+	__be16	processor_id;
+	__be32	enqueue_to_dispatch_time;
+	__be32	ready_to_enqueue_time;
+	__be32	waiting_to_ready_time;
+	__be64	timebase;
+	__be64	fault_addr;
+	__be64	srr0;
+	__be64	srr1;
+};
+
+#define DISPATCH_LOG_BYTES	4096	/* bytes per cpu */
+#define N_DISPATCH_LOG		(DISPATCH_LOG_BYTES / sizeof(struct dtl_entry))
+
+/*
+ * Dispatch trace log event enable mask:
+ *   0x1: voluntary virtual processor waits
+ *   0x2: time-slice preempts
+ *   0x4: virtual partition memory page faults
+ */
+#define DTL_LOG_CEDE		0x1
+#define DTL_LOG_PREEMPT		0x2
+#define DTL_LOG_FAULT		0x4
+#define DTL_LOG_ALL		(DTL_LOG_CEDE | DTL_LOG_PREEMPT | DTL_LOG_FAULT)
+
+extern struct kmem_cache *dtl_cache;
+extern rwlock_t dtl_access_lock;
+
+/*
+ * When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE = y, the cpu accounting code controls
+ * reading from the dispatch trace log.  If other code wants to consume
+ * DTL entries, it can set this pointer to a function that will get
+ * called once for each DTL entry that gets processed.
+ */
+extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index);
+
+extern void register_dtl_buffer(int cpu);
+extern void alloc_dtl_buffers(unsigned long *time_limit);
+extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity);
+
+#endif /* _ASM_POWERPC_DTL_H */
diff --git a/arch/powerpc/include/asm/lppaca.h b/arch/powerpc/include/asm/lppaca.h
index 3b4b305796ae..c390ec377bae 100644
--- a/arch/powerpc/include/asm/lppaca.h
+++ b/arch/powerpc/include/asm/lppaca.h
@@ -42,7 +42,6 @@
  */
 #include <linux/cache.h>
 #include <linux/threads.h>
-#include <linux/spinlock_types.h>
 #include <asm/types.h>
 #include <asm/mmu.h>
 #include <asm/firmware.h>
@@ -146,49 +145,6 @@ struct slb_shadow {
 	} save_area[SLB_NUM_BOLTED];
 } ____cacheline_aligned;
 
-/*
- * Layout of entries in the hypervisor's dispatch trace log buffer.
- */
-struct dtl_entry {
-	u8	dispatch_reason;
-	u8	preempt_reason;
-	__be16	processor_id;
-	__be32	enqueue_to_dispatch_time;
-	__be32	ready_to_enqueue_time;
-	__be32	waiting_to_ready_time;
-	__be64	timebase;
-	__be64	fault_addr;
-	__be64	srr0;
-	__be64	srr1;
-};
-
-#define DISPATCH_LOG_BYTES	4096	/* bytes per cpu */
-#define N_DISPATCH_LOG		(DISPATCH_LOG_BYTES / sizeof(struct dtl_entry))
-
-/*
- * Dispatch trace log event enable mask:
- *   0x1: voluntary virtual processor waits
- *   0x2: time-slice preempts
- *   0x4: virtual partition memory page faults
- */
-#define DTL_LOG_CEDE		0x1
-#define DTL_LOG_PREEMPT		0x2
-#define DTL_LOG_FAULT		0x4
-#define DTL_LOG_ALL		(DTL_LOG_CEDE | DTL_LOG_PREEMPT | DTL_LOG_FAULT)
-
-extern struct kmem_cache *dtl_cache;
-extern rwlock_t dtl_access_lock;
-
-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING_NATIVE = y, the cpu accounting code controls
- * reading from the dispatch trace log.  If other code wants to consume
- * DTL entries, it can set this pointer to a function that will get
- * called once for each DTL entry that gets processed.
- */
-extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index);
-
-extern void register_dtl_buffer(int cpu);
-extern void alloc_dtl_buffers(unsigned long *time_limit);
 extern long hcall_vphn(unsigned long cpu, u64 flags, __be32 *associativity);
 
 #endif /* CONFIG_PPC_BOOK3S */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 45a839a7c6cf..84b2564cf5a4 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -29,7 +29,6 @@
 #include <asm/hmi.h>
 #include <asm/cpuidle.h>
 #include <asm/atomic.h>
-#include <asm/rtas-types.h>
 
 #include <asm-generic/mmiowb_types.h>
 
@@ -53,6 +52,7 @@ extern unsigned int debug_smp_processor_id(void); /* from linux/smp.h */
 #define get_slb_shadow()	(get_paca()->slb_shadow_ptr)
 
 struct task_struct;
+struct rtas_args;
 
 /*
  * Defines the layout of the paca.
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 6fcae436ae51..f85539ebb513 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -183,6 +183,8 @@ static inline unsigned long read_spurr(unsigned long tb)
 
 #ifdef CONFIG_PPC_SPLPAR
 
+#include <asm/dtl.h>
+
 /*
  * Scan the dispatch trace log and count up the stolen time.
  * Should be called with interrupts disabled.
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6bf66649ab92..ebb04f331ad3 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -74,6 +74,7 @@
 #include <asm/hw_breakpoint.h>
 #include <asm/kvm_book3s_uvmem.h>
 #include <asm/ultravisor.h>
+#include <asm/dtl.h>
 
 #include "book3s.h"
 
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index eab8aa293743..982f069e4c31 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -12,6 +12,7 @@
 #include <asm/smp.h>
 #include <linux/uaccess.h>
 #include <asm/firmware.h>
+#include <asm/dtl.h>
 #include <asm/lppaca.h>
 #include <asm/debugfs.h>
 #include <asm/plpar_wrappers.h>
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index fd26f3d21d7b..f71ff2c94efe 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -40,6 +40,7 @@
 #include <asm/fadump.h>
 #include <asm/asm-prototypes.h>
 #include <asm/debugfs.h>
+#include <asm/dtl.h>
 
 #include "pseries.h"
 
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 2db8469e475f..27094c872fd6 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -70,6 +70,7 @@
 #include <asm/idle.h>
 #include <asm/swiotlb.h>
 #include <asm/svm.h>
+#include <asm/dtl.h>
 
 #include "pseries.h"
 #include "../../../../drivers/pci/pci.h"
diff --git a/arch/powerpc/platforms/pseries/svm.c b/arch/powerpc/platforms/pseries/svm.c
index 40c0637203d5..e6d7a344d9f2 100644
--- a/arch/powerpc/platforms/pseries/svm.c
+++ b/arch/powerpc/platforms/pseries/svm.c
@@ -11,6 +11,7 @@
 #include <asm/svm.h>
 #include <asm/swiotlb.h>
 #include <asm/ultravisor.h>
+#include <asm/dtl.h>
 
 static int __init init_svm(void)
 {

From ba1f2b2eaa2a529dba722507c55ff3d761d325dd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2020 15:50:29 +0200
Subject: [PATCH 318/502] x86/entry: Fix NMI vs IRQ state tracking

While the nmi_enter() users did
trace_hardirqs_{off_finish,on_prepare}(), there were no matching
lockdep_hardirqs_*() calls to complete the picture.

Introduce idtentry_{enter,exit}_nmi() to enable proper IRQ state
tracking across the NMIs.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200623083721.216740948@infradead.org
---
 arch/x86/entry/common.c         | 42 +++++++++++++++++++++++++++++----
 arch/x86/include/asm/idtentry.h |  3 +++
 arch/x86/kernel/nmi.c           |  9 ++++---
 arch/x86/kernel/traps.c         | 17 +++++--------
 include/linux/hardirq.h         | 28 +++++++++++++++-------
 5 files changed, 70 insertions(+), 29 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 0521546022cb..63c607dd6c52 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -592,7 +592,7 @@ SYSCALL_DEFINE0(ni_syscall)
  * The return value must be fed into the state argument of
  * idtentry_exit().
  */
-idtentry_state_t noinstr idtentry_enter(struct pt_regs *regs)
+noinstr idtentry_state_t idtentry_enter(struct pt_regs *regs)
 {
 	idtentry_state_t ret = {
 		.exit_rcu = false,
@@ -687,7 +687,7 @@ static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
  * Counterpart to idtentry_enter(). The return value of the entry
  * function must be fed into the @state argument.
  */
-void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state)
+noinstr void idtentry_exit(struct pt_regs *regs, idtentry_state_t state)
 {
 	lockdep_assert_irqs_disabled();
 
@@ -731,7 +731,7 @@ void noinstr idtentry_exit(struct pt_regs *regs, idtentry_state_t state)
  * Invokes enter_from_user_mode() to establish the proper context for
  * NOHZ_FULL. Otherwise scheduling on exit would not be possible.
  */
-void noinstr idtentry_enter_user(struct pt_regs *regs)
+noinstr void idtentry_enter_user(struct pt_regs *regs)
 {
 	check_user_regs(regs);
 	enter_from_user_mode();
@@ -749,13 +749,47 @@ void noinstr idtentry_enter_user(struct pt_regs *regs)
  *
  * Counterpart to idtentry_enter_user().
  */
-void noinstr idtentry_exit_user(struct pt_regs *regs)
+noinstr void idtentry_exit_user(struct pt_regs *regs)
 {
 	lockdep_assert_irqs_disabled();
 
 	prepare_exit_to_usermode(regs);
 }
 
+noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
+{
+	bool irq_state = lockdep_hardirqs_enabled(current);
+
+	__nmi_enter();
+	lockdep_hardirqs_off(CALLER_ADDR0);
+	lockdep_hardirq_enter();
+	rcu_nmi_enter();
+
+	instrumentation_begin();
+	trace_hardirqs_off_finish();
+	ftrace_nmi_enter();
+	instrumentation_end();
+
+	return irq_state;
+}
+
+noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore)
+{
+	instrumentation_begin();
+	ftrace_nmi_exit();
+	if (restore) {
+		trace_hardirqs_on_prepare();
+		lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+	}
+	instrumentation_end();
+
+	rcu_nmi_exit();
+	lockdep_hardirq_exit();
+	if (restore)
+		lockdep_hardirqs_on(CALLER_ADDR0);
+	__nmi_exit();
+}
+
 #ifdef CONFIG_XEN_PV
 #ifndef CONFIG_PREEMPTION
 /*
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 7227225cf45d..2b0497486525 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -20,6 +20,9 @@ typedef struct idtentry_state {
 idtentry_state_t idtentry_enter(struct pt_regs *regs);
 void idtentry_exit(struct pt_regs *regs, idtentry_state_t state);
 
+bool idtentry_enter_nmi(struct pt_regs *regs);
+void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state);
+
 /**
  * DECLARE_IDTENTRY - Declare functions for simple IDT entry points
  *		      No error code pushed by hardware
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index d7c5e44b26f7..4fc9954a9560 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -330,7 +330,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
 	__this_cpu_write(last_nmi_rip, regs->ip);
 
 	instrumentation_begin();
-	trace_hardirqs_off_finish();
 
 	handled = nmi_handle(NMI_LOCAL, regs);
 	__this_cpu_add(nmi_stats.normal, handled);
@@ -417,8 +416,6 @@ static noinstr void default_do_nmi(struct pt_regs *regs)
 		unknown_nmi_error(reason, regs);
 
 out:
-	if (regs->flags & X86_EFLAGS_IF)
-		trace_hardirqs_on_prepare();
 	instrumentation_end();
 }
 
@@ -478,6 +475,8 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
 
 DEFINE_IDTENTRY_RAW(exc_nmi)
 {
+	bool irq_state;
+
 	if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
 		return;
 
@@ -491,14 +490,14 @@ nmi_restart:
 
 	this_cpu_write(nmi_dr7, local_db_save());
 
-	nmi_enter();
+	irq_state = idtentry_enter_nmi(regs);
 
 	inc_irq_stat(__nmi_count);
 
 	if (!ignore_nmis)
 		default_do_nmi(regs);
 
-	nmi_exit();
+	idtentry_exit_nmi(regs, irq_state);
 
 	local_db_restore(this_cpu_read(nmi_dr7));
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4627f826fb57..cdd73829e637 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -403,7 +403,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
 	}
 #endif
 
-	nmi_enter();
+	idtentry_enter_nmi(regs);
 	instrumentation_begin();
 	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 
@@ -649,15 +649,12 @@ DEFINE_IDTENTRY_RAW(exc_int3)
 		instrumentation_end();
 		idtentry_exit_user(regs);
 	} else {
-		nmi_enter();
+		bool irq_state = idtentry_enter_nmi(regs);
 		instrumentation_begin();
-		trace_hardirqs_off_finish();
 		if (!do_int3(regs))
 			die("int3", regs, 0);
-		if (regs->flags & X86_EFLAGS_IF)
-			trace_hardirqs_on_prepare();
 		instrumentation_end();
-		nmi_exit();
+		idtentry_exit_nmi(regs, irq_state);
 	}
 }
 
@@ -865,9 +862,8 @@ out:
 static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 					     unsigned long dr6)
 {
-	nmi_enter();
+	bool irq_state = idtentry_enter_nmi(regs);
 	instrumentation_begin();
-	trace_hardirqs_off_finish();
 
 	/*
 	 * If something gets miswired and we end up here for a user mode
@@ -884,10 +880,8 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 
 	handle_debug(regs, dr6, false);
 
-	if (regs->flags & X86_EFLAGS_IF)
-		trace_hardirqs_on_prepare();
 	instrumentation_end();
-	nmi_exit();
+	idtentry_exit_nmi(regs, irq_state);
 }
 
 static __always_inline void exc_debug_user(struct pt_regs *regs,
@@ -903,6 +897,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs,
 	instrumentation_begin();
 
 	handle_debug(regs, dr6, true);
+
 	instrumentation_end();
 	idtentry_exit_user(regs);
 }
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 03c9fece7d43..754f67ac4326 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -111,32 +111,42 @@ extern void rcu_nmi_exit(void);
 /*
  * nmi_enter() can nest up to 15 times; see NMI_BITS.
  */
-#define nmi_enter()						\
+#define __nmi_enter()						\
 	do {							\
+		lockdep_off();					\
 		arch_nmi_enter();				\
 		printk_nmi_enter();				\
-		lockdep_off();					\
 		BUG_ON(in_nmi() == NMI_MASK);			\
 		__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);	\
-		rcu_nmi_enter();				\
+	} while (0)
+
+#define nmi_enter()						\
+	do {							\
+		__nmi_enter();					\
 		lockdep_hardirq_enter();			\
+		rcu_nmi_enter();				\
 		instrumentation_begin();			\
 		ftrace_nmi_enter();				\
 		instrumentation_end();				\
 	} while (0)
 
+#define __nmi_exit()						\
+	do {							\
+		BUG_ON(!in_nmi());				\
+		__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
+		printk_nmi_exit();				\
+		arch_nmi_exit();				\
+		lockdep_on();					\
+	} while (0)
+
 #define nmi_exit()						\
 	do {							\
 		instrumentation_begin();			\
 		ftrace_nmi_exit();				\
 		instrumentation_end();				\
-		lockdep_hardirq_exit();				\
 		rcu_nmi_exit();					\
-		BUG_ON(!in_nmi());				\
-		__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);	\
-		lockdep_on();					\
-		printk_nmi_exit();				\
-		arch_nmi_exit();				\
+		lockdep_hardirq_exit();				\
+		__nmi_exit();					\
 	} while (0)
 
 #endif /* LINUX_HARDIRQ_H */

From 28e5bfd81c8de77504703adf24ceff9301e3c7be Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 29 May 2020 22:41:05 +0200
Subject: [PATCH 319/502] s390: Break cyclic percpu include

In order to use <asm/percpu.h> in irqflags.h, we need to make sure
asm/percpu.h does not itself depend on irqflags.h.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200623083721.396143816@infradead.org
---
 arch/s390/include/asm/smp.h         | 1 +
 arch/s390/include/asm/thread_info.h | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index 7326f110d48c..f48a43b63d9e 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -10,6 +10,7 @@
 
 #include <asm/sigp.h>
 #include <asm/lowcore.h>
+#include <asm/processor.h>
 
 #define raw_smp_processor_id()	(S390_lowcore.cpu_nr)
 
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index e582fbe59e20..13a04fcf7762 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -24,7 +24,6 @@
 #ifndef __ASSEMBLY__
 #include <asm/lowcore.h>
 #include <asm/page.h>
-#include <asm/processor.h>
 
 #define STACK_INIT_OFFSET \
 	(THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs))

From a6342915881a687b07847b7c57628de07a256525 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 22 Jun 2020 17:21:58 +0200
Subject: [PATCH 320/502] arm: Break cyclic percpu include

In order to use <asm/percpu.h> in irqflags.h, we need to make sure
asm/percpu.h does not itself depend on irqflags.h.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lkml.kernel.org/r/20200623083721.454517573@infradead.org
---
 arch/arm/include/asm/percpu.h      | 2 ++
 arch/arm/include/asm/thread_info.h | 5 -----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/arm/include/asm/percpu.h b/arch/arm/include/asm/percpu.h
index f44f448537f2..e2fcb3cfd3de 100644
--- a/arch/arm/include/asm/percpu.h
+++ b/arch/arm/include/asm/percpu.h
@@ -5,6 +5,8 @@
 #ifndef _ASM_ARM_PERCPU_H_
 #define _ASM_ARM_PERCPU_H_
 
+register unsigned long current_stack_pointer asm ("sp");
+
 /*
  * Same as asm-generic/percpu.h, except that we store the per cpu offset
  * in the TPIDRPRW. TPIDRPRW only exists on V6K and V7
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 3609a6980c34..536b6b979f63 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -75,11 +75,6 @@ struct thread_info {
 	.addr_limit	= KERNEL_DS,					\
 }
 
-/*
- * how to get the current stack pointer in C
- */
-register unsigned long current_stack_pointer asm ("sp");
-
 /*
  * how to get the thread information struct from C
  */

From a21ee6055c30ce68c4e201c6496f0ed2a1936230 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 25 May 2020 12:22:41 +0200
Subject: [PATCH 321/502] lockdep: Change hardirq{s_enabled,_context} to
 per-cpu variables

Currently all IRQ-tracking state is in task_struct; this means that
task_struct needs to be defined before we use it.

Especially for lockdep_assert_irq*() this can lead to header-hell.

Move the hardirq state into per-cpu variables to avoid the task_struct
dependency.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200623083721.512673481@infradead.org
---
 include/linux/irqflags.h | 19 ++++++++++++-------
 include/linux/lockdep.h  | 34 ++++++++++++++++++----------------
 include/linux/sched.h    |  2 --
 kernel/fork.c            |  4 +---
 kernel/locking/lockdep.c | 30 +++++++++++++++---------------
 kernel/softirq.c         |  6 ++++++
 6 files changed, 52 insertions(+), 43 deletions(-)

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 6384d2813ded..255444fe4609 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -14,6 +14,7 @@
 
 #include <linux/typecheck.h>
 #include <asm/irqflags.h>
+#include <asm/percpu.h>
 
 /* Currently lockdep_softirqs_on/off is used only by lockdep */
 #ifdef CONFIG_PROVE_LOCKING
@@ -31,18 +32,22 @@
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
+
+DECLARE_PER_CPU(int, hardirqs_enabled);
+DECLARE_PER_CPU(int, hardirq_context);
+
   extern void trace_hardirqs_on_prepare(void);
   extern void trace_hardirqs_off_finish(void);
   extern void trace_hardirqs_on(void);
   extern void trace_hardirqs_off(void);
-# define lockdep_hardirq_context(p)	((p)->hardirq_context)
+# define lockdep_hardirq_context(p)	(this_cpu_read(hardirq_context))
 # define lockdep_softirq_context(p)	((p)->softirq_context)
-# define lockdep_hardirqs_enabled(p)	((p)->hardirqs_enabled)
+# define lockdep_hardirqs_enabled(p)	(this_cpu_read(hardirqs_enabled))
 # define lockdep_softirqs_enabled(p)	((p)->softirqs_enabled)
-# define lockdep_hardirq_enter()		\
-do {						\
-	if (!current->hardirq_context++)	\
-		current->hardirq_threaded = 0;	\
+# define lockdep_hardirq_enter()			\
+do {							\
+	if (this_cpu_inc_return(hardirq_context) == 1)	\
+		current->hardirq_threaded = 0;		\
 } while (0)
 # define lockdep_hardirq_threaded()		\
 do {						\
@@ -50,7 +55,7 @@ do {						\
 } while (0)
 # define lockdep_hardirq_exit()			\
 do {						\
-	current->hardirq_context--;		\
+	this_cpu_dec(hardirq_context);		\
 } while (0)
 # define lockdep_softirq_enter()		\
 do {						\
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 3b73cf84f77d..be6cb17a8879 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -11,6 +11,7 @@
 #define __LINUX_LOCKDEP_H
 
 #include <linux/lockdep_types.h>
+#include <asm/percpu.h>
 
 struct task_struct;
 
@@ -529,28 +530,29 @@ do {									\
 	lock_release(&(lock)->dep_map, _THIS_IP_);			\
 } while (0)
 
-#define lockdep_assert_irqs_enabled()	do {				\
-		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
-			  !current->hardirqs_enabled,			\
-			  "IRQs not enabled as expected\n");		\
-	} while (0)
+DECLARE_PER_CPU(int, hardirqs_enabled);
+DECLARE_PER_CPU(int, hardirq_context);
 
-#define lockdep_assert_irqs_disabled()	do {				\
-		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
-			  current->hardirqs_enabled,			\
-			  "IRQs not disabled as expected\n");		\
-	} while (0)
+#define lockdep_assert_irqs_enabled()					\
+do {									\
+	WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirqs_enabled));	\
+} while (0)
 
-#define lockdep_assert_in_irq() do {					\
-		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
-			  !current->hardirq_context,			\
-			  "Not in hardirq as expected\n");		\
-	} while (0)
+#define lockdep_assert_irqs_disabled()					\
+do {									\
+	WARN_ON_ONCE(debug_locks && this_cpu_read(hardirqs_enabled));	\
+} while (0)
+
+#define lockdep_assert_in_irq()						\
+do {									\
+	WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirq_context));	\
+} while (0)
 
 #else
 # define might_lock(lock) do { } while (0)
 # define might_lock_read(lock) do { } while (0)
 # define might_lock_nested(lock, subclass) do { } while (0)
+
 # define lockdep_assert_irqs_enabled() do { } while (0)
 # define lockdep_assert_irqs_disabled() do { } while (0)
 # define lockdep_assert_in_irq() do { } while (0)
@@ -560,7 +562,7 @@ do {									\
 
 # define lockdep_assert_RT_in_threaded_ctx() do {			\
 		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
-			  current->hardirq_context &&			\
+			  lockdep_hardirq_context(current) &&		\
 			  !(current->hardirq_threaded || current->irq_config),	\
 			  "Not in threaded context on PREEMPT_RT as expected\n");	\
 } while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 692e327d7455..3903a9500926 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -990,8 +990,6 @@ struct task_struct {
 	unsigned long			hardirq_disable_ip;
 	unsigned int			hardirq_enable_event;
 	unsigned int			hardirq_disable_event;
-	int				hardirqs_enabled;
-	int				hardirq_context;
 	u64				hardirq_chain_key;
 	unsigned long			softirq_disable_ip;
 	unsigned long			softirq_enable_ip;
diff --git a/kernel/fork.c b/kernel/fork.c
index efc5493203ae..70d9d0a4de2a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1954,8 +1954,8 @@ static __latent_entropy struct task_struct *copy_process(
 
 	rt_mutex_init_task(p);
 
+	lockdep_assert_irqs_enabled();
 #ifdef CONFIG_PROVE_LOCKING
-	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
 	retval = -EAGAIN;
@@ -2036,7 +2036,6 @@ static __latent_entropy struct task_struct *copy_process(
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
 	p->irq_events = 0;
-	p->hardirqs_enabled = 0;
 	p->hardirq_enable_ip = 0;
 	p->hardirq_enable_event = 0;
 	p->hardirq_disable_ip = _THIS_IP_;
@@ -2046,7 +2045,6 @@ static __latent_entropy struct task_struct *copy_process(
 	p->softirq_enable_event = 0;
 	p->softirq_disable_ip = 0;
 	p->softirq_disable_event = 0;
-	p->hardirq_context = 0;
 	p->softirq_context = 0;
 #endif
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d595623c4b34..ab4ffbe0e9e9 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr,
 	pr_warn("-----------------------------------------------------\n");
 	pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, task_pid_nr(curr),
-		curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
+		lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
 		curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
-		curr->hardirqs_enabled,
+		lockdep_hardirqs_enabled(curr),
 		curr->softirqs_enabled);
 	print_lock(next);
 
@@ -3658,7 +3658,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
 	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
-	if (unlikely(current->hardirqs_enabled)) {
+	if (unlikely(lockdep_hardirqs_enabled(current))) {
 		/*
 		 * Neither irq nor preemption are disabled here
 		 * so this is racy by nature but losing one hit
@@ -3686,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
 	 * Can't allow enabling interrupts while in an interrupt handler,
 	 * that's general bad form and such. Recursion, limited stack etc..
 	 */
-	if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
+	if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current)))
 		return;
 
 	current->hardirq_chain_key = current->curr_chain_key;
@@ -3724,7 +3724,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
-	if (curr->hardirqs_enabled) {
+	if (lockdep_hardirqs_enabled(curr)) {
 		/*
 		 * Neither irq nor preemption are disabled here
 		 * so this is racy by nature but losing one hit
@@ -3751,7 +3751,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 
 skip_checks:
 	/* we'll do an OFF -> ON transition: */
-	curr->hardirqs_enabled = 1;
+	this_cpu_write(hardirqs_enabled, 1);
 	curr->hardirq_enable_ip = ip;
 	curr->hardirq_enable_event = ++curr->irq_events;
 	debug_atomic_inc(hardirqs_on_events);
@@ -3783,11 +3783,11 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (curr->hardirqs_enabled) {
+	if (lockdep_hardirqs_enabled(curr)) {
 		/*
 		 * We have done an ON -> OFF transition:
 		 */
-		curr->hardirqs_enabled = 0;
+		this_cpu_write(hardirqs_enabled, 0);
 		curr->hardirq_disable_ip = ip;
 		curr->hardirq_disable_event = ++curr->irq_events;
 		debug_atomic_inc(hardirqs_off_events);
@@ -3832,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip)
 	 * usage bit for all held locks, if hardirqs are
 	 * enabled too:
 	 */
-	if (curr->hardirqs_enabled)
+	if (lockdep_hardirqs_enabled(curr))
 		mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
 	lockdep_recursion_finish();
 }
@@ -3881,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 	 */
 	if (!hlock->trylock) {
 		if (hlock->read) {
-			if (curr->hardirq_context)
+			if (lockdep_hardirq_context(curr))
 				if (!mark_lock(curr, hlock,
 						LOCK_USED_IN_HARDIRQ_READ))
 					return 0;
@@ -3890,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 						LOCK_USED_IN_SOFTIRQ_READ))
 					return 0;
 		} else {
-			if (curr->hardirq_context)
+			if (lockdep_hardirq_context(curr))
 				if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
 					return 0;
 			if (curr->softirq_context)
@@ -3928,7 +3928,7 @@ lock_used:
 
 static inline unsigned int task_irq_context(struct task_struct *task)
 {
-	return LOCK_CHAIN_HARDIRQ_CONTEXT * !!task->hardirq_context +
+	return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) +
 	       LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
 }
 
@@ -4021,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr)
 	 * Set appropriate wait type for the context; for IRQs we have to take
 	 * into account force_irqthread as that is implied by PREEMPT_RT.
 	 */
-	if (curr->hardirq_context) {
+	if (lockdep_hardirq_context(curr)) {
 		/*
 		 * Check if force_irqthreads will run us threaded.
 		 */
@@ -4864,11 +4864,11 @@ static void check_flags(unsigned long flags)
 		return;
 
 	if (irqs_disabled_flags(flags)) {
-		if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+		if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) {
 			printk("possible reason: unannotated irqs-off.\n");
 		}
 	} else {
-		if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+		if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) {
 			printk("possible reason: unannotated irqs-on.\n");
 		}
 	}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c4201b7f42b1..342c53feaa7a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -107,6 +107,12 @@ static bool ksoftirqd_running(unsigned long pending)
  * where hardirqs are disabled legitimately:
  */
 #ifdef CONFIG_TRACE_IRQFLAGS
+
+DEFINE_PER_CPU(int, hardirqs_enabled);
+DEFINE_PER_CPU(int, hardirq_context);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
+EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
+
 void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
 {
 	unsigned long flags;

From f9ad4a5f3f20bee022b1bdde94e5ece6dc0b0edc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 May 2020 13:03:26 +0200
Subject: [PATCH 322/502] lockdep: Remove lockdep_hardirq{s_enabled,_context}()
 argument

Now that the macros use per-cpu data, we no longer need the argument.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/20200623083721.571835311@infradead.org
---
 arch/x86/entry/common.c        |  2 +-
 include/linux/irqflags.h       |  8 ++++----
 include/linux/lockdep.h        |  2 +-
 kernel/locking/lockdep.c       | 30 +++++++++++++++---------------
 kernel/softirq.c               |  2 +-
 tools/include/linux/irqflags.h |  4 ++--
 6 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 63c607dd6c52..4ea640363f5d 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -758,7 +758,7 @@ noinstr void idtentry_exit_user(struct pt_regs *regs)
 
 noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
 {
-	bool irq_state = lockdep_hardirqs_enabled(current);
+	bool irq_state = lockdep_hardirqs_enabled();
 
 	__nmi_enter();
 	lockdep_hardirqs_off(CALLER_ADDR0);
diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 255444fe4609..5811ee8a5cd8 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -40,9 +40,9 @@ DECLARE_PER_CPU(int, hardirq_context);
   extern void trace_hardirqs_off_finish(void);
   extern void trace_hardirqs_on(void);
   extern void trace_hardirqs_off(void);
-# define lockdep_hardirq_context(p)	(this_cpu_read(hardirq_context))
+# define lockdep_hardirq_context()	(this_cpu_read(hardirq_context))
 # define lockdep_softirq_context(p)	((p)->softirq_context)
-# define lockdep_hardirqs_enabled(p)	(this_cpu_read(hardirqs_enabled))
+# define lockdep_hardirqs_enabled()	(this_cpu_read(hardirqs_enabled))
 # define lockdep_softirqs_enabled(p)	((p)->softirqs_enabled)
 # define lockdep_hardirq_enter()			\
 do {							\
@@ -109,9 +109,9 @@ do {						\
 # define trace_hardirqs_off_finish()		do { } while (0)
 # define trace_hardirqs_on()		do { } while (0)
 # define trace_hardirqs_off()		do { } while (0)
-# define lockdep_hardirq_context(p)	0
+# define lockdep_hardirq_context()	0
 # define lockdep_softirq_context(p)	0
-# define lockdep_hardirqs_enabled(p)	0
+# define lockdep_hardirqs_enabled()	0
 # define lockdep_softirqs_enabled(p)	0
 # define lockdep_hardirq_enter()	do { } while (0)
 # define lockdep_hardirq_threaded()	do { } while (0)
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index be6cb17a8879..fd04b9e96091 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -562,7 +562,7 @@ do {									\
 
 # define lockdep_assert_RT_in_threaded_ctx() do {			\
 		WARN_ONCE(debug_locks && !current->lockdep_recursion &&	\
-			  lockdep_hardirq_context(current) &&		\
+			  lockdep_hardirq_context() &&			\
 			  !(current->hardirq_threaded || current->irq_config),	\
 			  "Not in threaded context on PREEMPT_RT as expected\n");	\
 } while (0)
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index ab4ffbe0e9e9..c9ea05edce25 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2062,9 +2062,9 @@ print_bad_irq_dependency(struct task_struct *curr,
 	pr_warn("-----------------------------------------------------\n");
 	pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
 		curr->comm, task_pid_nr(curr),
-		lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+		lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
 		curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
-		lockdep_hardirqs_enabled(curr),
+		lockdep_hardirqs_enabled(),
 		curr->softirqs_enabled);
 	print_lock(next);
 
@@ -3331,9 +3331,9 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
 
 	pr_warn("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
 		curr->comm, task_pid_nr(curr),
-		lockdep_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+		lockdep_hardirq_context(), hardirq_count() >> HARDIRQ_SHIFT,
 		lockdep_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
-		lockdep_hardirqs_enabled(curr),
+		lockdep_hardirqs_enabled(),
 		lockdep_softirqs_enabled(curr));
 	print_lock(this);
 
@@ -3658,7 +3658,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
 	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
-	if (unlikely(lockdep_hardirqs_enabled(current))) {
+	if (unlikely(lockdep_hardirqs_enabled())) {
 		/*
 		 * Neither irq nor preemption are disabled here
 		 * so this is racy by nature but losing one hit
@@ -3686,7 +3686,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
 	 * Can't allow enabling interrupts while in an interrupt handler,
 	 * that's general bad form and such. Recursion, limited stack etc..
 	 */
-	if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context(current)))
+	if (DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context()))
 		return;
 
 	current->hardirq_chain_key = current->curr_chain_key;
@@ -3724,7 +3724,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 	if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
 		return;
 
-	if (lockdep_hardirqs_enabled(curr)) {
+	if (lockdep_hardirqs_enabled()) {
 		/*
 		 * Neither irq nor preemption are disabled here
 		 * so this is racy by nature but losing one hit
@@ -3783,7 +3783,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (lockdep_hardirqs_enabled(curr)) {
+	if (lockdep_hardirqs_enabled()) {
 		/*
 		 * We have done an ON -> OFF transition:
 		 */
@@ -3832,7 +3832,7 @@ void lockdep_softirqs_on(unsigned long ip)
 	 * usage bit for all held locks, if hardirqs are
 	 * enabled too:
 	 */
-	if (lockdep_hardirqs_enabled(curr))
+	if (lockdep_hardirqs_enabled())
 		mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
 	lockdep_recursion_finish();
 }
@@ -3881,7 +3881,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 	 */
 	if (!hlock->trylock) {
 		if (hlock->read) {
-			if (lockdep_hardirq_context(curr))
+			if (lockdep_hardirq_context())
 				if (!mark_lock(curr, hlock,
 						LOCK_USED_IN_HARDIRQ_READ))
 					return 0;
@@ -3890,7 +3890,7 @@ mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)
 						LOCK_USED_IN_SOFTIRQ_READ))
 					return 0;
 		} else {
-			if (lockdep_hardirq_context(curr))
+			if (lockdep_hardirq_context())
 				if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
 					return 0;
 			if (curr->softirq_context)
@@ -3928,7 +3928,7 @@ lock_used:
 
 static inline unsigned int task_irq_context(struct task_struct *task)
 {
-	return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context(task) +
+	return LOCK_CHAIN_HARDIRQ_CONTEXT * !!lockdep_hardirq_context() +
 	       LOCK_CHAIN_SOFTIRQ_CONTEXT * !!task->softirq_context;
 }
 
@@ -4021,7 +4021,7 @@ static inline short task_wait_context(struct task_struct *curr)
 	 * Set appropriate wait type for the context; for IRQs we have to take
 	 * into account force_irqthread as that is implied by PREEMPT_RT.
 	 */
-	if (lockdep_hardirq_context(curr)) {
+	if (lockdep_hardirq_context()) {
 		/*
 		 * Check if force_irqthreads will run us threaded.
 		 */
@@ -4864,11 +4864,11 @@ static void check_flags(unsigned long flags)
 		return;
 
 	if (irqs_disabled_flags(flags)) {
-		if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled(current))) {
+		if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) {
 			printk("possible reason: unannotated irqs-off.\n");
 		}
 	} else {
-		if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled(current))) {
+		if (DEBUG_LOCKS_WARN_ON(!lockdep_hardirqs_enabled())) {
 			printk("possible reason: unannotated irqs-on.\n");
 		}
 	}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 342c53feaa7a..5e9aaa648a74 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -230,7 +230,7 @@ static inline bool lockdep_softirq_start(void)
 {
 	bool in_hardirq = false;
 
-	if (lockdep_hardirq_context(current)) {
+	if (lockdep_hardirq_context()) {
 		in_hardirq = true;
 		lockdep_hardirq_exit();
 	}
diff --git a/tools/include/linux/irqflags.h b/tools/include/linux/irqflags.h
index 67e01bbadbfe..501262aee8ff 100644
--- a/tools/include/linux/irqflags.h
+++ b/tools/include/linux/irqflags.h
@@ -2,9 +2,9 @@
 #ifndef _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_
 #define _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_
 
-# define lockdep_hardirq_context(p)	0
+# define lockdep_hardirq_context()	0
 # define lockdep_softirq_context(p)	0
-# define lockdep_hardirqs_enabled(p)	0
+# define lockdep_hardirqs_enabled()	0
 # define lockdep_softirqs_enabled(p)	0
 # define lockdep_hardirq_enter()	do { } while (0)
 # define lockdep_hardirq_exit()		do { } while (0)

From 776499058167d9f41c8eb468e21fe2d241c0b8e6 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 1 Jul 2020 16:18:29 +0200
Subject: [PATCH 323/502] mm/memblock: expose only minimal interface to
 add/walk physmem

"physmem" in the memblock allocator is somewhat weird: it's not actually
used for allocation, it's simply information collected during boot, which
describes the unmodified physical memory map at boot time, without any
standby/hotplugged memory. It's only used on s390 and is currently the
only reason s390 keeps using CONFIG_ARCH_KEEP_MEMBLOCK.

Physmem isn't NUMA aware and current users don't specify any flags. Let's
hide it from the user, exposing only for_each_physmem_range(), and
simplify. The interface for physmem is now really minimalistic:
- memblock_physmem_add() to add ranges
- for_each_physmem_range() / __next_physmem_range() to walk physmem ranges

Don't place it into an __init section and don't discard it without
CONFIG_ARCH_KEEP_MEMBLOCK. As we're reusing __next_mem_range(), remove
the __meminit annotation to avoid section mismatch warnings once
CONFIG_ARCH_KEEP_MEMBLOCK is no longer used with
CONFIG_HAVE_MEMBLOCK_PHYS_MAP.

While fixing up the documentation, sneak in some related cleanups. We can
stop setting CONFIG_ARCH_KEEP_MEMBLOCK for s390 next.

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Mike Rapoport <rppt@linux.ibm.com>
Message-Id: <20200701141830.18749-2-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/kernel/crash_dump.c |  6 ++--
 include/linux/memblock.h      | 28 ++++++++++++++---
 mm/memblock.c                 | 57 ++++++++++++++++++-----------------
 3 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index f96a5857bbfd..c42ce348103c 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -549,8 +549,7 @@ static int get_mem_chunk_cnt(void)
 	int cnt = 0;
 	u64 idx;
 
-	for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE,
-			   MEMBLOCK_NONE, NULL, NULL, NULL)
+	for_each_physmem_range(idx, &oldmem_type, NULL, NULL)
 		cnt++;
 	return cnt;
 }
@@ -563,8 +562,7 @@ static void loads_init(Elf64_Phdr *phdr, u64 loads_offset)
 	phys_addr_t start, end;
 	u64 idx;
 
-	for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE,
-			   MEMBLOCK_NONE, &start, &end, NULL) {
+	for_each_physmem_range(idx, &oldmem_type, &start, &end) {
 		phdr->p_filesz = end - start;
 		phdr->p_type = PT_LOAD;
 		phdr->p_offset = start;
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index 017fae833d4a..9d925db0d355 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -77,16 +77,12 @@ struct memblock_type {
  * @current_limit: physical address of the current allocation limit
  * @memory: usable memory regions
  * @reserved: reserved memory regions
- * @physmem: all physical memory
  */
 struct memblock {
 	bool bottom_up;  /* is bottom up direction? */
 	phys_addr_t current_limit;
 	struct memblock_type memory;
 	struct memblock_type reserved;
-#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-	struct memblock_type physmem;
-#endif
 };
 
 extern struct memblock memblock;
@@ -145,6 +141,30 @@ void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
 
 void __memblock_free_late(phys_addr_t base, phys_addr_t size);
 
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+static inline void __next_physmem_range(u64 *idx, struct memblock_type *type,
+					phys_addr_t *out_start,
+					phys_addr_t *out_end)
+{
+	extern struct memblock_type physmem;
+
+	__next_mem_range(idx, NUMA_NO_NODE, MEMBLOCK_NONE, &physmem, type,
+			 out_start, out_end, NULL);
+}
+
+/**
+ * for_each_physmem_range - iterate through physmem areas not included in type.
+ * @i: u64 used as loop variable
+ * @type: ptr to memblock_type which excludes from the iteration, can be %NULL
+ * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL
+ * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL
+ */
+#define for_each_physmem_range(i, type, p_start, p_end)			\
+	for (i = 0, __next_physmem_range(&i, type, p_start, p_end);	\
+	     i != (u64)ULLONG_MAX;					\
+	     __next_physmem_range(&i, type, p_start, p_end))
+#endif /* CONFIG_HAVE_MEMBLOCK_PHYS_MAP */
+
 /**
  * for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
diff --git a/mm/memblock.c b/mm/memblock.c
index 39aceafc57f6..45f198750be9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -44,19 +44,20 @@
  *   in the system, for instance when the memory is restricted with
  *   ``mem=`` command line parameter
  * * ``reserved`` - describes the regions that were allocated
- * * ``physmap`` - describes the actual physical memory regardless of
- *   the possible restrictions; the ``physmap`` type is only available
- *   on some architectures.
+ * * ``physmem`` - describes the actual physical memory available during
+ *   boot regardless of the possible restrictions and memory hot(un)plug;
+ *   the ``physmem`` type is only available on some architectures.
  *
  * Each region is represented by :c:type:`struct memblock_region` that
  * defines the region extents, its attributes and NUMA node id on NUMA
  * systems. Every memory type is described by the :c:type:`struct
  * memblock_type` which contains an array of memory regions along with
- * the allocator metadata. The memory types are nicely wrapped with
- * :c:type:`struct memblock`. This structure is statically initialzed
- * at build time. The region arrays for the "memory" and "reserved"
- * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the
- * "physmap" type to %INIT_PHYSMEM_REGIONS.
+ * the allocator metadata. The "memory" and "reserved" types are nicely
+ * wrapped with :c:type:`struct memblock`. This structure is statically
+ * initialized at build time. The region arrays are initially sized to
+ * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS
+ * for "reserved". The region array for "physmem" is initially sized to
+ * %INIT_PHYSMEM_REGIONS.
  * The memblock_allow_resize() enables automatic resizing of the region
  * arrays during addition of new regions. This feature should be used
  * with care so that memory allocated for the region array will not
@@ -87,8 +88,8 @@
  * function frees all the memory to the buddy page allocator.
  *
  * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
- * memblock data structures will be discarded after the system
- * initialization completes.
+ * memblock data structures (except "physmem") will be discarded after the
+ * system initialization completes.
  */
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -104,7 +105,7 @@ unsigned long long max_possible_pfn;
 static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
 static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS];
 #endif
 
 struct memblock memblock __initdata_memblock = {
@@ -118,17 +119,19 @@ struct memblock memblock __initdata_memblock = {
 	.reserved.max		= INIT_MEMBLOCK_RESERVED_REGIONS,
 	.reserved.name		= "reserved",
 
-#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-	.physmem.regions	= memblock_physmem_init_regions,
-	.physmem.cnt		= 1,	/* empty dummy entry */
-	.physmem.max		= INIT_PHYSMEM_REGIONS,
-	.physmem.name		= "physmem",
-#endif
-
 	.bottom_up		= false,
 	.current_limit		= MEMBLOCK_ALLOC_ANYWHERE,
 };
 
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+struct memblock_type physmem = {
+	.regions		= memblock_physmem_init_regions,
+	.cnt			= 1,	/* empty dummy entry */
+	.max			= INIT_PHYSMEM_REGIONS,
+	.name			= "physmem",
+};
+#endif
+
 int memblock_debug __initdata_memblock;
 static bool system_has_some_mirror __initdata_memblock = false;
 static int memblock_can_resize __initdata_memblock;
@@ -838,7 +841,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
 	memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
 		     &base, &end, (void *)_RET_IP_);
 
-	return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0);
+	return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0);
 }
 #endif
 
@@ -1019,12 +1022,10 @@ static bool should_skip_region(struct memblock_region *m, int nid, int flags)
  * As both region arrays are sorted, the function advances the two indices
  * in lockstep and returns each intersection.
  */
-void __init_memblock __next_mem_range(u64 *idx, int nid,
-				      enum memblock_flags flags,
-				      struct memblock_type *type_a,
-				      struct memblock_type *type_b,
-				      phys_addr_t *out_start,
-				      phys_addr_t *out_end, int *out_nid)
+void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
+		      struct memblock_type *type_a,
+		      struct memblock_type *type_b, phys_addr_t *out_start,
+		      phys_addr_t *out_end, int *out_nid)
 {
 	int idx_a = *idx & 0xffffffff;
 	int idx_b = *idx >> 32;
@@ -1924,7 +1925,7 @@ void __init_memblock __memblock_dump_all(void)
 	memblock_dump(&memblock.memory);
 	memblock_dump(&memblock.reserved);
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-	memblock_dump(&memblock.physmem);
+	memblock_dump(&physmem);
 #endif
 }
 
@@ -2064,8 +2065,8 @@ static int __init memblock_init_debugfs(void)
 	debugfs_create_file("reserved", 0444, root,
 			    &memblock.reserved, &memblock_debug_fops);
 #ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-	debugfs_create_file("physmem", 0444, root,
-			    &memblock.physmem, &memblock_debug_fops);
+	debugfs_create_file("physmem", 0444, root, &physmem,
+			    &memblock_debug_fops);
 #endif
 
 	return 0;

From fa49066fc326b78e7141d68387179f8968e0e1f0 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 1 Jul 2020 16:18:30 +0200
Subject: [PATCH 324/502] s390/mm: don't set ARCH_KEEP_MEMBLOCK

Commit 50be63450728 ("s390/mm: Convert bootmem to memblock") mentions
	"The original bootmem allocator is getting replaced by memblock. To
	cover the needs of the s390 kdump implementation the physical
	memory list is used."

As we can now reference "physmem" managed in the memblock allocator after
init even without ARCH_KEEP_MEMBLOCK, and s390x no longer needs
other memblock metadata after boot (esp., the zcore memmap device that used
it got removed), we can stop setting ARCH_KEEP_MEMBLOCK.

With this change, we no longer create memblocks for standby/hotplugged
memory (added via add_memory()) and free up memblock metadata (except
physmem) after boot.

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Philipp Rudo <prudo@linux.ibm.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200701141830.18749-3-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index c7d7ede6300c..7697a1f8e819 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -102,7 +102,6 @@ config S390
 	select ARCH_INLINE_WRITE_UNLOCK_BH
 	select ARCH_INLINE_WRITE_UNLOCK_IRQ
 	select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
-	select ARCH_KEEP_MEMBLOCK
 	select ARCH_STACKWALK
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_NUMA_BALANCING

From c8337c47deb9338417c61e7a6ba7de690eb1d300 Mon Sep 17 00:00:00 2001
From: Harald Freudenberger <freude@linux.ibm.com>
Date: Wed, 1 Jul 2020 12:40:39 +0200
Subject: [PATCH 325/502] s390/ap: rework crypto config info and default domain
 code

Rework of the QCI crypto info and how it is used.
This is only an internal rework and does not affect how
the ap bus acts with ap card and queue devices and
domain handling.

Tested on z15, z14, z12 (QCI support) and z196 (no QCI support).

Signed-off-by: Harald Freudenberger <freude@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 drivers/s390/crypto/ap_bus.c | 305 ++++++++++++++++++-----------------
 1 file changed, 155 insertions(+), 150 deletions(-)

diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c
index 64fa66788194..f218a0b67ed5 100644
--- a/drivers/s390/crypto/ap_bus.c
+++ b/drivers/s390/crypto/ap_bus.c
@@ -73,8 +73,7 @@ EXPORT_SYMBOL(ap_perms);
 DEFINE_MUTEX(ap_perms_mutex);
 EXPORT_SYMBOL(ap_perms_mutex);
 
-static struct ap_config_info *ap_configuration;
-static bool initialised;
+static struct ap_config_info *ap_qci_info;
 
 /*
  * AP bus related debug feature things.
@@ -105,8 +104,10 @@ static struct hrtimer ap_poll_timer;
  */
 static unsigned long long poll_timeout = 250000;
 
-/* Maximum domain id */
-static int ap_max_domain_id;
+/* Maximum domain id, if not given via qci */
+static int ap_max_domain_id = 15;
+/* Maximum adapter id, if not given via qci */
+static int ap_max_adapter_id = 63;
 
 static struct bus_type ap_bus_type;
 
@@ -154,12 +155,12 @@ static int ap_interrupts_available(void)
 }
 
 /**
- * ap_configuration_available(): Test if AP configuration
- * information is available.
+ * ap_qci_available(): Test if AP configuration
+ * information can be queried via QCI subfunction.
  *
- * Returns 1 if AP configuration information is available.
+ * Returns 1 if subfunction PQAP(QCI) is available.
  */
-static int ap_configuration_available(void)
+static int ap_qci_available(void)
 {
 	return test_facility(12);
 }
@@ -182,22 +183,22 @@ static int ap_apft_available(void)
  */
 static inline int ap_qact_available(void)
 {
-	if (ap_configuration)
-		return ap_configuration->qact;
+	if (ap_qci_info)
+		return ap_qci_info->qact;
 	return 0;
 }
 
 /*
- * ap_query_configuration(): Fetch cryptographic config info
+ * ap_fetch_qci_info(): Fetch cryptographic config info
  *
  * Returns the ap configuration info fetched via PQAP(QCI).
  * On success 0 is returned, on failure a negative errno
  * is returned, e.g. if the PQAP(QCI) instruction is not
  * available, the return value will be -EOPNOTSUPP.
  */
-static inline int ap_query_configuration(struct ap_config_info *info)
+static inline int ap_fetch_qci_info(struct ap_config_info *info)
 {
-	if (!ap_configuration_available())
+	if (!ap_qci_available())
 		return -EOPNOTSUPP;
 	if (!info)
 		return -EINVAL;
@@ -205,20 +206,39 @@ static inline int ap_query_configuration(struct ap_config_info *info)
 }
 
 /**
- * ap_init_configuration(): Allocate and query configuration array.
- */
-static void ap_init_configuration(void)
-{
-	if (!ap_configuration_available())
-		return;
+ * ap_init_qci_info(): Allocate and query qci config info.
+ * Does also update the static variables ap_max_domain_id
+ * and ap_max_adapter_id if this info is available.
 
-	ap_configuration = kzalloc(sizeof(*ap_configuration), GFP_KERNEL);
-	if (!ap_configuration)
+ */
+static void __init ap_init_qci_info(void)
+{
+	if (!ap_qci_available()) {
+		AP_DBF(DBF_INFO, "%s QCI not supported\n", __func__);
 		return;
-	if (ap_query_configuration(ap_configuration) != 0) {
-		kfree(ap_configuration);
-		ap_configuration = NULL;
+	}
+
+	ap_qci_info = kzalloc(sizeof(*ap_qci_info), GFP_KERNEL);
+	if (!ap_qci_info)
 		return;
+	if (ap_fetch_qci_info(ap_qci_info) != 0) {
+		kfree(ap_qci_info);
+		ap_qci_info = NULL;
+		return;
+	}
+	AP_DBF(DBF_INFO, "%s successful fetched initial qci info\n", __func__);
+
+	if (ap_qci_info->apxa) {
+		if (ap_qci_info->Na) {
+			ap_max_adapter_id = ap_qci_info->Na;
+			AP_DBF(DBF_INFO, "%s new ap_max_adapter_id is %d\n",
+			       __func__, ap_max_adapter_id);
+		}
+		if (ap_qci_info->Nd) {
+			ap_max_domain_id = ap_qci_info->Nd;
+			AP_DBF(DBF_INFO, "%s new ap_max_domain_id is %d\n",
+			       __func__, ap_max_domain_id);
+		}
 	}
 }
 
@@ -233,7 +253,6 @@ static inline int ap_test_config(unsigned int *field, unsigned int nr)
 
 /*
  * ap_test_config_card_id(): Test, whether an AP card ID is configured.
- * @id AP card ID
  *
  * Returns 0 if the card is not configured
  *	   1 if the card is configured or
@@ -241,16 +260,16 @@ static inline int ap_test_config(unsigned int *field, unsigned int nr)
  */
 static inline int ap_test_config_card_id(unsigned int id)
 {
-	if (!ap_configuration)	/* QCI not supported */
-		/* only ids 0...3F may be probed */
-		return id < 0x40 ? 1 : 0;
-	return ap_test_config(ap_configuration->apm, id);
+	if (id > ap_max_adapter_id)
+		return 0;
+	if (ap_qci_info)
+		return ap_test_config(ap_qci_info->apm, id);
+	return 1;
 }
 
 /*
  * ap_test_config_usage_domain(): Test, whether an AP usage domain
  * is configured.
- * @domain AP usage domain ID
  *
  * Returns 0 if the usage domain is not configured
  *	   1 if the usage domain is configured or
@@ -258,9 +277,11 @@ static inline int ap_test_config_card_id(unsigned int id)
  */
 int ap_test_config_usage_domain(unsigned int domain)
 {
-	if (!ap_configuration)	/* QCI not supported */
-		return domain < 16;
-	return ap_test_config(ap_configuration->aqm, domain);
+	if (domain > ap_max_domain_id)
+		return 0;
+	if (ap_qci_info)
+		return ap_test_config(ap_qci_info->aqm, domain);
+	return 1;
 }
 EXPORT_SYMBOL(ap_test_config_usage_domain);
 
@@ -274,43 +295,44 @@ EXPORT_SYMBOL(ap_test_config_usage_domain);
  */
 int ap_test_config_ctrl_domain(unsigned int domain)
 {
-	if (!ap_configuration)	/* QCI not supported */
+	if (!ap_qci_info || domain > ap_max_domain_id)
 		return 0;
-	return ap_test_config(ap_configuration->adm, domain);
+	return ap_test_config(ap_qci_info->adm, domain);
 }
 EXPORT_SYMBOL(ap_test_config_ctrl_domain);
 
-/**
- * ap_query_queue(): Check if an AP queue is available.
- * @qid: The AP queue number
- * @queue_depth: Pointer to queue depth value
- * @device_type: Pointer to device type value
- * @facilities: Pointer to facility indicator
+/*
+ * ap_queue_info(): Check and get AP queue info.
+ * Returns true if TAPQ succeeded and the info is filled or
+ * false otherwise.
  */
-static int ap_query_queue(ap_qid_t qid, int *queue_depth, int *device_type,
-			  unsigned int *facilities)
+static bool ap_queue_info(ap_qid_t qid, int *q_type,
+			  unsigned int *q_fac, int *q_depth)
 {
 	struct ap_queue_status status;
-	unsigned long info;
-	int nd;
+	unsigned long info = 0;
 
-	if (!ap_test_config_card_id(AP_QID_CARD(qid)))
-		return -ENODEV;
+	/* make sure we don't run into a specifiation exception */
+	if (AP_QID_CARD(qid) > ap_max_adapter_id ||
+	    AP_QID_QUEUE(qid) > ap_max_domain_id)
+		return false;
 
+	/* call TAPQ on this APQN */
 	status = ap_test_queue(qid, ap_apft_available(), &info);
 	switch (status.response_code) {
 	case AP_RESPONSE_NORMAL:
-		*queue_depth = (int)(info & 0xff);
-		*device_type = (int)((info >> 24) & 0xff);
-		*facilities = (unsigned int)(info >> 32);
-		/* Update maximum domain id */
-		nd = (info >> 16) & 0xff;
-		/* if N bit is available, z13 and newer */
-		if ((info & (1UL << 57)) && nd > 0)
-			ap_max_domain_id = nd;
-		else /* older machine types */
-			ap_max_domain_id = 15;
-		switch (*device_type) {
+	case AP_RESPONSE_RESET_IN_PROGRESS:
+		/*
+		 * According to the architecture in all these cases the
+		 * info should be filled. All bits 0 is not possible as
+		 * there is at least one of the mode bits set.
+		 */
+		if (WARN_ON_ONCE(!info))
+			return false;
+		*q_type = (int)((info >> 24) & 0xff);
+		*q_fac = (unsigned int)(info >> 32);
+		*q_depth = (int)(info & 0xff);
+		switch (*q_type) {
 			/* For CEX2 and CEX3 the available functions
 			 * are not reflected by the facilities bits.
 			 * Instead it is coded into the type. So here
@@ -318,27 +340,21 @@ static int ap_query_queue(ap_qid_t qid, int *queue_depth, int *device_type,
 			 */
 		case AP_DEVICE_TYPE_CEX2A:
 		case AP_DEVICE_TYPE_CEX3A:
-			*facilities |= 0x08000000;
+			*q_fac |= 0x08000000;
 			break;
 		case AP_DEVICE_TYPE_CEX2C:
 		case AP_DEVICE_TYPE_CEX3C:
-			*facilities |= 0x10000000;
+			*q_fac |= 0x10000000;
 			break;
 		default:
 			break;
 		}
-		return 0;
-	case AP_RESPONSE_Q_NOT_AVAIL:
-	case AP_RESPONSE_DECONFIGURED:
-	case AP_RESPONSE_CHECKSTOPPED:
-	case AP_RESPONSE_INVALID_ADDRESS:
-		return -ENODEV;
-	case AP_RESPONSE_RESET_IN_PROGRESS:
-	case AP_RESPONSE_OTHERWISE_CHANGED:
-	case AP_RESPONSE_BUSY:
-		return -EBUSY;
+		return true;
 	default:
-		BUG();
+		/*
+		 * A response code which indicates, there is no info available.
+		 */
+		return false;
 	}
 }
 
@@ -751,9 +767,6 @@ int ap_driver_register(struct ap_driver *ap_drv, struct module *owner,
 {
 	struct device_driver *drv = &ap_drv->driver;
 
-	if (!initialised)
-		return -ENODEV;
-
 	drv->bus = &ap_bus_type;
 	drv->probe = ap_device_probe;
 	drv->remove = ap_device_remove;
@@ -929,11 +942,12 @@ static ssize_t ap_domain_store(struct bus_type *bus,
 	    domain < 0 || domain > ap_max_domain_id ||
 	    !test_bit_inv(domain, ap_perms.aqm))
 		return -EINVAL;
+
 	spin_lock_bh(&ap_domain_lock);
 	ap_domain_index = domain;
 	spin_unlock_bh(&ap_domain_lock);
 
-	AP_DBF(DBF_DEBUG, "stored new default domain=%d\n", domain);
+	AP_DBF(DBF_INFO, "stored new default domain=%d\n", domain);
 
 	return count;
 }
@@ -942,45 +956,45 @@ static BUS_ATTR_RW(ap_domain);
 
 static ssize_t ap_control_domain_mask_show(struct bus_type *bus, char *buf)
 {
-	if (!ap_configuration)	/* QCI not supported */
+	if (!ap_qci_info)	/* QCI not supported */
 		return scnprintf(buf, PAGE_SIZE, "not supported\n");
 
 	return scnprintf(buf, PAGE_SIZE,
 			 "0x%08x%08x%08x%08x%08x%08x%08x%08x\n",
-			 ap_configuration->adm[0], ap_configuration->adm[1],
-			 ap_configuration->adm[2], ap_configuration->adm[3],
-			 ap_configuration->adm[4], ap_configuration->adm[5],
-			 ap_configuration->adm[6], ap_configuration->adm[7]);
+			 ap_qci_info->adm[0], ap_qci_info->adm[1],
+			 ap_qci_info->adm[2], ap_qci_info->adm[3],
+			 ap_qci_info->adm[4], ap_qci_info->adm[5],
+			 ap_qci_info->adm[6], ap_qci_info->adm[7]);
 }
 
 static BUS_ATTR_RO(ap_control_domain_mask);
 
 static ssize_t ap_usage_domain_mask_show(struct bus_type *bus, char *buf)
 {
-	if (!ap_configuration)	/* QCI not supported */
+	if (!ap_qci_info)	/* QCI not supported */
 		return scnprintf(buf, PAGE_SIZE, "not supported\n");
 
 	return scnprintf(buf, PAGE_SIZE,
 			 "0x%08x%08x%08x%08x%08x%08x%08x%08x\n",
-			 ap_configuration->aqm[0], ap_configuration->aqm[1],
-			 ap_configuration->aqm[2], ap_configuration->aqm[3],
-			 ap_configuration->aqm[4], ap_configuration->aqm[5],
-			 ap_configuration->aqm[6], ap_configuration->aqm[7]);
+			 ap_qci_info->aqm[0], ap_qci_info->aqm[1],
+			 ap_qci_info->aqm[2], ap_qci_info->aqm[3],
+			 ap_qci_info->aqm[4], ap_qci_info->aqm[5],
+			 ap_qci_info->aqm[6], ap_qci_info->aqm[7]);
 }
 
 static BUS_ATTR_RO(ap_usage_domain_mask);
 
 static ssize_t ap_adapter_mask_show(struct bus_type *bus, char *buf)
 {
-	if (!ap_configuration)	/* QCI not supported */
+	if (!ap_qci_info)	/* QCI not supported */
 		return scnprintf(buf, PAGE_SIZE, "not supported\n");
 
 	return scnprintf(buf, PAGE_SIZE,
 			 "0x%08x%08x%08x%08x%08x%08x%08x%08x\n",
-			 ap_configuration->apm[0], ap_configuration->apm[1],
-			 ap_configuration->apm[2], ap_configuration->apm[3],
-			 ap_configuration->apm[4], ap_configuration->apm[5],
-			 ap_configuration->apm[6], ap_configuration->apm[7]);
+			 ap_qci_info->apm[0], ap_qci_info->apm[1],
+			 ap_qci_info->apm[2], ap_qci_info->apm[3],
+			 ap_qci_info->apm[4], ap_qci_info->apm[5],
+			 ap_qci_info->apm[6], ap_qci_info->apm[7]);
 }
 
 static BUS_ATTR_RO(ap_adapter_mask);
@@ -1066,17 +1080,18 @@ static BUS_ATTR_RW(poll_timeout);
 
 static ssize_t ap_max_domain_id_show(struct bus_type *bus, char *buf)
 {
-	int max_domain_id;
-
-	if (ap_configuration)
-		max_domain_id = ap_max_domain_id ? : -1;
-	else
-		max_domain_id = 15;
-	return scnprintf(buf, PAGE_SIZE, "%d\n", max_domain_id);
+	return scnprintf(buf, PAGE_SIZE, "%d\n", ap_max_domain_id);
 }
 
 static BUS_ATTR_RO(ap_max_domain_id);
 
+static ssize_t ap_max_adapter_id_show(struct bus_type *bus, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%d\n", ap_max_adapter_id);
+}
+
+static BUS_ATTR_RO(ap_max_adapter_id);
+
 static ssize_t apmask_show(struct bus_type *bus, char *buf)
 {
 	int rc;
@@ -1149,6 +1164,7 @@ static struct bus_attribute *const ap_bus_attrs[] = {
 	&bus_attr_ap_interrupts,
 	&bus_attr_poll_timeout,
 	&bus_attr_ap_max_domain_id,
+	&bus_attr_ap_max_adapter_id,
 	&bus_attr_apmask,
 	&bus_attr_aqmask,
 	NULL,
@@ -1160,47 +1176,42 @@ static struct bus_attribute *const ap_bus_attrs[] = {
  */
 static void ap_select_domain(void)
 {
-	int count, max_count, best_domain;
 	struct ap_queue_status status;
-	int i, j;
+	int card, dom;
 
 	/*
-	 * We want to use a single domain. Either the one specified with
-	 * the "domain=" parameter or the domain with the maximum number
-	 * of devices.
+	 * Choose the default domain. Either the one specified with
+	 * the "domain=" parameter or the first domain with at least
+	 * one valid APQN.
 	 */
 	spin_lock_bh(&ap_domain_lock);
 	if (ap_domain_index >= 0) {
 		/* Domain has already been selected. */
-		spin_unlock_bh(&ap_domain_lock);
-		return;
+		goto out;
 	}
-	best_domain = -1;
-	max_count = 0;
-	for (i = 0; i < AP_DOMAINS; i++) {
-		if (!ap_test_config_usage_domain(i) ||
-		    !test_bit_inv(i, ap_perms.aqm))
+	for (dom = 0; dom <= ap_max_domain_id; dom++) {
+		if (!ap_test_config_usage_domain(dom) ||
+		    !test_bit_inv(dom, ap_perms.aqm))
 			continue;
-		count = 0;
-		for (j = 0; j < AP_DEVICES; j++) {
-			if (!ap_test_config_card_id(j))
+		for (card = 0; card <= ap_max_adapter_id; card++) {
+			if (!ap_test_config_card_id(card) ||
+			    !test_bit_inv(card, ap_perms.apm))
 				continue;
-			status = ap_test_queue(AP_MKQID(j, i),
+			status = ap_test_queue(AP_MKQID(card, dom),
 					       ap_apft_available(),
 					       NULL);
-			if (status.response_code != AP_RESPONSE_NORMAL)
-				continue;
-			count++;
-		}
-		if (count > max_count) {
-			max_count = count;
-			best_domain = i;
+			if (status.response_code == AP_RESPONSE_NORMAL)
+				break;
 		}
+		if (card <= ap_max_adapter_id)
+			break;
 	}
-	if (best_domain >= 0) {
-		ap_domain_index = best_domain;
-		AP_DBF(DBF_DEBUG, "new ap_domain_index=%d\n", ap_domain_index);
+	if (dom <= ap_max_domain_id) {
+		ap_domain_index = dom;
+		AP_DBF(DBF_DEBUG, "%s new default domain is %d\n",
+		       __func__, ap_domain_index);
 	}
+out:
 	spin_unlock_bh(&ap_domain_lock);
 }
 
@@ -1279,12 +1290,13 @@ static int __match_queue_device_with_queue_id(struct device *dev, const void *da
  */
 static void _ap_scan_bus_adapter(int id)
 {
+	bool broken;
 	ap_qid_t qid;
 	unsigned int func;
 	struct ap_card *ac;
 	struct device *dev;
 	struct ap_queue *aq;
-	int rc, dom, depth, type, comp_type, borked;
+	int rc, dom, depth, type, comp_type;
 
 	/* check if there is a card device registered with this id */
 	dev = bus_find_device(&ap_bus_type, NULL,
@@ -1312,23 +1324,23 @@ static void _ap_scan_bus_adapter(int id)
 		/* find the first valid queue */
 		for (dom = 0; dom < AP_DOMAINS; dom++) {
 			qid = AP_MKQID(id, dom);
-			if (ap_query_queue(qid, &depth, &type, &func) == 0)
+			if (ap_queue_info(qid, &type, &func, &depth))
 				break;
 		}
-		borked = 0;
+		broken = false;
 		if (dom >= AP_DOMAINS) {
 			/* no accessible queue on this card */
-			borked = 1;
+			broken = true;
 		} else if (ac->raw_hwtype != type) {
 			/* card type has changed */
 			AP_DBF(DBF_INFO, "card=%02x type changed.\n", id);
-			borked = 1;
+			broken = true;
 		} else if (ac->functions != func) {
 			/* card functions have changed */
 			AP_DBF(DBF_INFO, "card=%02x functions changed.\n", id);
-			borked = 1;
+			broken = true;
 		}
-		if (borked) {
+		if (broken) {
 			/* unregister card device and associated queues */
 			bus_for_each_dev(&ap_bus_type, NULL,
 					 (void *)(long) id,
@@ -1364,16 +1376,14 @@ static void _ap_scan_bus_adapter(int id)
 			continue;
 		}
 		/* try to fetch infos about this queue */
-		rc = ap_query_queue(qid, &depth, &type, &func);
+		broken = !ap_queue_info(qid, &type, &func, &depth);
 		if (dev) {
-			if (rc == -ENODEV)
-				borked = 1;
-			else {
+			if (!broken) {
 				spin_lock_bh(&aq->lock);
-				borked = aq->sm_state == AP_SM_STATE_BORKED;
+				broken = aq->sm_state == AP_SM_STATE_BORKED;
 				spin_unlock_bh(&aq->lock);
 			}
-			if (borked) {
+			if (broken) {
 				/* Remove broken device */
 				AP_DBF(DBF_DEBUG,
 				       "removing broken queue=%02x.%04x\n",
@@ -1383,7 +1393,7 @@ static void _ap_scan_bus_adapter(int id)
 			put_device(dev);
 			continue;
 		}
-		if (rc)
+		if (broken)
 			continue;
 		/* a new queue device is needed, check out comp type */
 		comp_type = ap_get_compatible_type(qid, type, func);
@@ -1435,11 +1445,11 @@ static void ap_scan_bus(struct work_struct *unused)
 {
 	int id;
 
-	AP_DBF(DBF_DEBUG, "%s running\n", __func__);
-
-	ap_query_configuration(ap_configuration);
+	ap_fetch_qci_info(ap_qci_info);
 	ap_select_domain();
 
+	AP_DBF(DBF_DEBUG, "%s running\n", __func__);
+
 	/* loop over all possible adapters */
 	for (id = 0; id < AP_DEVICES; id++)
 		_ap_scan_bus_adapter(id);
@@ -1505,7 +1515,6 @@ static void __init ap_perms_init(void)
  */
 static int __init ap_module_init(void)
 {
-	int max_domain_id;
 	int rc, i;
 
 	rc = ap_debug_init();
@@ -1524,14 +1533,10 @@ static int __init ap_module_init(void)
 	ap_perms_init();
 
 	/* Get AP configuration data if available */
-	ap_init_configuration();
+	ap_init_qci_info();
 
-	if (ap_configuration)
-		max_domain_id =
-			ap_max_domain_id ? ap_max_domain_id : AP_DOMAINS - 1;
-	else
-		max_domain_id = 15;
-	if (ap_domain_index < -1 || ap_domain_index > max_domain_id ||
+	/* check default domain setting */
+	if (ap_domain_index < -1 || ap_domain_index > ap_max_domain_id ||
 	    (ap_domain_index >= 0 &&
 	     !test_bit_inv(ap_domain_index, ap_perms.aqm))) {
 		pr_warn("%d is not a valid cryptographic domain\n",
@@ -1539,6 +1544,7 @@ static int __init ap_module_init(void)
 		ap_domain_index = -1;
 	}
 
+	/* enable interrupts if available */
 	if (ap_interrupts_available()) {
 		rc = register_adapter_interrupt(&ap_airq);
 		ap_airq_flag = (rc == 0);
@@ -1581,7 +1587,6 @@ static int __init ap_module_init(void)
 	}
 
 	queue_work(system_long_wq, &ap_scan_work);
-	initialised = true;
 
 	return 0;
 
@@ -1595,7 +1600,7 @@ out_bus:
 out:
 	if (ap_using_interrupts())
 		unregister_adapter_interrupt(&ap_airq);
-	kfree(ap_configuration);
+	kfree(ap_qci_info);
 	return rc;
 }
 device_initcall(ap_module_init);

From 7b7735c5be473473d7a4b9e31460ed8e129dcb36 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 7 Jul 2020 14:07:53 +0200
Subject: [PATCH 326/502] s390: fix comment regarding interrupts in svc

With the removal of the critical section cleanup, we now enter the svc
interrupt handler with interrupts disabled.

Fixes: 0b0ed657fe00 ("s390: remove critical section cleanup from entry.S")
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/kernel/entry.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 969b35b177dd..23edf196d3dc 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -370,7 +370,7 @@ EXPORT_SYMBOL(sie_exit)
 
 /*
  * SVC interrupt handler routine. System calls are synchronous events and
- * are executed with interrupts enabled.
+ * are entered with interrupts disabled.
  */
 
 ENTRY(system_call)

From 6589c93f99894e007a1260f009018effc958ab69 Mon Sep 17 00:00:00 2001
From: Sven Schnelle <svens@linux.ibm.com>
Date: Wed, 8 Jul 2020 11:21:25 +0200
Subject: [PATCH 327/502] s390: add trace events for idle enter/exit

Helpful for debugging.

Signed-off-by: Sven Schnelle <svens@linux.ibm.com>
Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/kernel/idle.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c
index 0d7fbdfe995a..88bb42ca5008 100644
--- a/arch/s390/kernel/idle.c
+++ b/arch/s390/kernel/idle.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/cpu.h>
 #include <linux/sched/cputime.h>
+#include <trace/events/power.h>
 #include <asm/nmi.h>
 #include <asm/smp.h>
 #include "entry.h"
@@ -32,11 +33,12 @@ void enabled_wait(void)
 		PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK;
 	clear_cpu_flag(CIF_NOHZ_DELAY);
 
+	trace_cpu_idle_rcuidle(1, smp_processor_id());
 	local_irq_save(flags);
 	/* Call the assembler magic in entry.S */
 	psw_idle(idle, psw_mask);
 	local_irq_restore(flags);
-
+	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 
 	/* Account time spent with enabled wait psw loaded as idle time. */
 	write_seqcount_begin(&idle->seqcount);

From 61c11656b67b0a30f702f240aabe81fd93e702ac Mon Sep 17 00:00:00 2001
From: Zhenyu Ye <yezhenyu2@huawei.com>
Date: Fri, 10 Jul 2020 17:41:58 +0800
Subject: [PATCH 328/502] arm64: tlb: don't set the ttl value in
 flush_tlb_page_nosync

flush_tlb_page_nosync() may be called from pmd level, so we
cannot set ttl = 3 here.

The callstack is as follows:

	pmdp_set_access_flags
		ptep_set_access_flags
			flush_tlb_fix_spurious_fault
				flush_tlb_page
					flush_tlb_page_nosync

Fixes: e735b98a5fe0 ("arm64: Add tlbi_user_level TLB invalidation helper")
Reported-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
Link: https://lore.kernel.org/r/20200710094158.468-1-yezhenyu2@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/tlbflush.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 39aed2efd21b..2cb275efcea3 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -209,9 +209,8 @@ static inline void flush_tlb_page_nosync(struct vm_area_struct *vma,
 	unsigned long addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm));
 
 	dsb(ishst);
-	/* This function is only called on a small page */
-	__tlbi_level(vale1is, addr, 3);
-	__tlbi_user_level(vale1is, addr, 3);
+	__tlbi(vale1is, addr);
+	__tlbi_user(vale1is, addr);
 }
 
 static inline void flush_tlb_page(struct vm_area_struct *vma,

From 028a342ec8e128c5d71548d1210f1dba1ae95332 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Tue, 26 May 2020 21:38:07 +0900
Subject: [PATCH 329/502] m68k: Add arch/m68k/Kbuild

Use the standard obj-y form to specify the sub-directories under
arch/m68k/. No functional change intended.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Greg Ungerer <gerg@linux-m68k.org>
Link: https://lore.kernel.org/r/20200526123810.301667-1-masahiroy@kernel.org
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/Kbuild   | 19 +++++++++++++++++++
 arch/m68k/Makefile | 20 +-------------------
 2 files changed, 20 insertions(+), 19 deletions(-)
 create mode 100644 arch/m68k/Kbuild

diff --git a/arch/m68k/Kbuild b/arch/m68k/Kbuild
new file mode 100644
index 000000000000..7dc1398dd188
--- /dev/null
+++ b/arch/m68k/Kbuild
@@ -0,0 +1,19 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-y				+= kernel/ mm/
+obj-$(CONFIG_Q40)		+= q40/
+obj-$(CONFIG_AMIGA)		+= amiga/
+obj-$(CONFIG_ATARI)		+= atari/
+obj-$(CONFIG_MAC)		+= mac/
+obj-$(CONFIG_HP300)		+= hp300/
+obj-$(CONFIG_APOLLO)		+= apollo/
+obj-$(CONFIG_MVME147)		+= mvme147/
+obj-$(CONFIG_MVME16x)		+= mvme16x/
+obj-$(CONFIG_BVME6000)		+= bvme6000/
+obj-$(CONFIG_SUN3X)		+= sun3x/ sun3/
+obj-$(CONFIG_SUN3)		+= sun3/ sun3/prom/
+obj-$(CONFIG_NATFEAT)		+= emu/
+obj-$(CONFIG_M68040)		+= fpsp040/
+obj-$(CONFIG_M68060)		+= ifpsp060/
+obj-$(CONFIG_M68KFPU_EMU)	+= math-emu/
+obj-$(CONFIG_M68000)		+= 68000/
+obj-$(CONFIG_COLDFIRE)		+= coldfire/
diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile
index 0415d28dbe4f..e431015f5cc9 100644
--- a/arch/m68k/Makefile
+++ b/arch/m68k/Makefile
@@ -97,27 +97,9 @@ head-$(CONFIG_SUN3)		:= arch/m68k/kernel/sun3-head.o
 head-$(CONFIG_M68000)		:= arch/m68k/68000/head.o
 head-$(CONFIG_COLDFIRE)		:= arch/m68k/coldfire/head.o
 
-core-y				+= arch/m68k/kernel/	arch/m68k/mm/
+core-y				+= arch/m68k/
 libs-y				+= arch/m68k/lib/
 
-core-$(CONFIG_Q40)		+= arch/m68k/q40/
-core-$(CONFIG_AMIGA)		+= arch/m68k/amiga/
-core-$(CONFIG_ATARI)		+= arch/m68k/atari/
-core-$(CONFIG_MAC)		+= arch/m68k/mac/
-core-$(CONFIG_HP300)		+= arch/m68k/hp300/
-core-$(CONFIG_APOLLO)		+= arch/m68k/apollo/
-core-$(CONFIG_MVME147)		+= arch/m68k/mvme147/
-core-$(CONFIG_MVME16x)		+= arch/m68k/mvme16x/
-core-$(CONFIG_BVME6000)		+= arch/m68k/bvme6000/
-core-$(CONFIG_SUN3X)		+= arch/m68k/sun3x/	arch/m68k/sun3/
-core-$(CONFIG_SUN3)		+= arch/m68k/sun3/	arch/m68k/sun3/prom/
-core-$(CONFIG_NATFEAT)		+= arch/m68k/emu/
-core-$(CONFIG_M68040)		+= arch/m68k/fpsp040/
-core-$(CONFIG_M68060)		+= arch/m68k/ifpsp060/
-core-$(CONFIG_M68KFPU_EMU)	+= arch/m68k/math-emu/
-core-$(CONFIG_M68000)		+= arch/m68k/68000/
-core-$(CONFIG_COLDFIRE)		+= arch/m68k/coldfire/
-
 
 all:	zImage
 

From bd3ff3f1b69cdb315d91fef0fb9512af83ae579b Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Tue, 26 May 2020 21:38:08 +0900
Subject: [PATCH 330/502] m68k: sun3: Descend to prom from arch/m68k/sun3

Move prom/ to the more relevant Makefile.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Link: https://lore.kernel.org/r/20200526123810.301667-2-masahiroy@kernel.org
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/Kbuild        | 2 +-
 arch/m68k/sun3/Makefile | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/m68k/Kbuild b/arch/m68k/Kbuild
index 7dc1398dd188..18abb35c26a1 100644
--- a/arch/m68k/Kbuild
+++ b/arch/m68k/Kbuild
@@ -10,7 +10,7 @@ obj-$(CONFIG_MVME147)		+= mvme147/
 obj-$(CONFIG_MVME16x)		+= mvme16x/
 obj-$(CONFIG_BVME6000)		+= bvme6000/
 obj-$(CONFIG_SUN3X)		+= sun3x/ sun3/
-obj-$(CONFIG_SUN3)		+= sun3/ sun3/prom/
+obj-$(CONFIG_SUN3)		+= sun3/
 obj-$(CONFIG_NATFEAT)		+= emu/
 obj-$(CONFIG_M68040)		+= fpsp040/
 obj-$(CONFIG_M68060)		+= ifpsp060/
diff --git a/arch/m68k/sun3/Makefile b/arch/m68k/sun3/Makefile
index 9960c46d303c..4e99e17d82ea 100644
--- a/arch/m68k/sun3/Makefile
+++ b/arch/m68k/sun3/Makefile
@@ -5,4 +5,4 @@
 
 obj-y	:= sun3ints.o sun3dvma.o idprom.o
 
-obj-$(CONFIG_SUN3) += config.o mmu_emu.o leds.o dvma.o intersil.o
+obj-$(CONFIG_SUN3) += config.o mmu_emu.o leds.o dvma.o intersil.o prom/

From 2367b0264294a50cd2cd9d4c1270a9393f32038c Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Tue, 26 May 2020 21:38:09 +0900
Subject: [PATCH 331/502] m68k: Optimize cc-option calls for cpuflags-y

arch/m68k/Makefile computes lots of unneeded cc-option calls.

For example, if CONFIG_M5441x is not defined, there is no point in
evaluating the following compiler flag.

 cpuflags-$(CONFIG_M5441x)      := $(call cc-option,-mcpu=54455,-mcfv4e)

The result is set to cpuflags-, then thrown away.

The right hand side of ':=' is immediately expanded. Hence, all of the
16 calls for cc-option are evaluated. This is expensive since cc-option
invokes the compiler. This occurs even if you are not attempting to
build anything, like 'make ARCH=m68k help'.

Use '=' to expand the value _lazily_. The evaluation for cc-option is
delayed until $(cpuflags-y) is expanded. So, the cc-option test happens
just once at most.

This commit mimics tune-y of arch/arm/Makefile.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Greg Ungerer <gerg@linux-m68k.org>
Link: https://lore.kernel.org/r/20200526123810.301667-3-masahiroy@kernel.org
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/Makefile | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile
index e431015f5cc9..0507bf297727 100644
--- a/arch/m68k/Makefile
+++ b/arch/m68k/Makefile
@@ -32,30 +32,33 @@ endif
 #	compiler cpu type flag.
 #
 ifndef CONFIG_M68040
-cpuflags-$(CONFIG_M68060)	:= -m68060
+cpuflags-$(CONFIG_M68060)	= -m68060
 endif
 ifndef CONFIG_M68060
-cpuflags-$(CONFIG_M68040)	:= -m68040
+cpuflags-$(CONFIG_M68040)	= -m68040
 endif
-cpuflags-$(CONFIG_M68030)	:=
-cpuflags-$(CONFIG_M68020)	:=
-cpuflags-$(CONFIG_M68000)	:= -m68000
-cpuflags-$(CONFIG_M5441x)	:= $(call cc-option,-mcpu=54455,-mcfv4e)
-cpuflags-$(CONFIG_M54xx)	:= $(call cc-option,-mcpu=5475,-m5200)
-cpuflags-$(CONFIG_M5407)	:= $(call cc-option,-mcpu=5407,-m5200)
-cpuflags-$(CONFIG_M532x)	:= $(call cc-option,-mcpu=532x,-m5307)
-cpuflags-$(CONFIG_M537x)	:= $(call cc-option,-mcpu=537x,-m5307)
-cpuflags-$(CONFIG_M5307)	:= $(call cc-option,-mcpu=5307,-m5200)
-cpuflags-$(CONFIG_M528x)	:= $(call cc-option,-mcpu=528x,-m5307)
-cpuflags-$(CONFIG_M5275)	:= $(call cc-option,-mcpu=5275,-m5307)
-cpuflags-$(CONFIG_M5272)	:= $(call cc-option,-mcpu=5272,-m5307)
-cpuflags-$(CONFIG_M5271)	:= $(call cc-option,-mcpu=5271,-m5307)
-cpuflags-$(CONFIG_M523x)	:= $(call cc-option,-mcpu=523x,-m5307)
-cpuflags-$(CONFIG_M525x)	:= $(call cc-option,-mcpu=5253,-m5200)
-cpuflags-$(CONFIG_M5249)	:= $(call cc-option,-mcpu=5249,-m5200)
-cpuflags-$(CONFIG_M520x)	:= $(call cc-option,-mcpu=5208,-m5200)
-cpuflags-$(CONFIG_M5206e)	:= $(call cc-option,-mcpu=5206e,-m5200)
-cpuflags-$(CONFIG_M5206)	:= $(call cc-option,-mcpu=5206,-m5200)
+cpuflags-$(CONFIG_M68030)	=
+cpuflags-$(CONFIG_M68020)	=
+cpuflags-$(CONFIG_M68000)	= -m68000
+cpuflags-$(CONFIG_M5441x)	= $(call cc-option,-mcpu=54455,-mcfv4e)
+cpuflags-$(CONFIG_M54xx)	= $(call cc-option,-mcpu=5475,-m5200)
+cpuflags-$(CONFIG_M5407)	= $(call cc-option,-mcpu=5407,-m5200)
+cpuflags-$(CONFIG_M532x)	= $(call cc-option,-mcpu=532x,-m5307)
+cpuflags-$(CONFIG_M537x)	= $(call cc-option,-mcpu=537x,-m5307)
+cpuflags-$(CONFIG_M5307)	= $(call cc-option,-mcpu=5307,-m5200)
+cpuflags-$(CONFIG_M528x)	= $(call cc-option,-mcpu=528x,-m5307)
+cpuflags-$(CONFIG_M5275)	= $(call cc-option,-mcpu=5275,-m5307)
+cpuflags-$(CONFIG_M5272)	= $(call cc-option,-mcpu=5272,-m5307)
+cpuflags-$(CONFIG_M5271)	= $(call cc-option,-mcpu=5271,-m5307)
+cpuflags-$(CONFIG_M523x)	= $(call cc-option,-mcpu=523x,-m5307)
+cpuflags-$(CONFIG_M525x)	= $(call cc-option,-mcpu=5253,-m5200)
+cpuflags-$(CONFIG_M5249)	= $(call cc-option,-mcpu=5249,-m5200)
+cpuflags-$(CONFIG_M520x)	= $(call cc-option,-mcpu=5208,-m5200)
+cpuflags-$(CONFIG_M5206e)	= $(call cc-option,-mcpu=5206e,-m5200)
+cpuflags-$(CONFIG_M5206)	= $(call cc-option,-mcpu=5206,-m5200)
+
+# Evaluate tune cc-option calls now
+cpuflags-y := $(cpuflags-y)
 
 KBUILD_AFLAGS += $(cpuflags-y)
 KBUILD_CFLAGS += $(cpuflags-y)

From 40b13fd7fd6e1ec295230cc114c6c9309e15784a Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Tue, 26 May 2020 21:38:10 +0900
Subject: [PATCH 332/502] m68k: Pass -D options to KBUILD_CPPFLAGS instead of
 KBUILD_{A,C}FLAGS

Precisely, -D is a preprocessor option.

KBUILD_CPPFLAGS is passed for compiling .c and .S files too.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Acked-by: Greg Ungerer <gerg@linux-m68k.org>
Link: https://lore.kernel.org/r/20200526123810.301667-4-masahiroy@kernel.org
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/Makefile | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile
index 0507bf297727..71ffaf5f8954 100644
--- a/arch/m68k/Makefile
+++ b/arch/m68k/Makefile
@@ -70,9 +70,8 @@ ifdef CONFIG_MMU
 KBUILD_CFLAGS += -fno-strength-reduce -ffixed-a2
 else
 # we can use a m68k-linux-gcc toolchain with these in place
-KBUILD_CFLAGS += -DUTS_SYSNAME=\"uClinux\"
-KBUILD_CFLAGS += -D__uClinux__
-KBUILD_AFLAGS += -D__uClinux__
+KBUILD_CPPFLAGS += -DUTS_SYSNAME=\"uClinux\"
+KBUILD_CPPFLAGS += -D__uClinux__
 endif
 
 KBUILD_LDFLAGS := -m m68kelf

From 5f5f2949c14d6fe5cfc51bd98a41fdf69652c7e3 Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
Date: Wed, 27 May 2020 08:39:42 -0500
Subject: [PATCH 333/502] m68k: Use sizeof_field() helper

Make use of the sizeof_field() helper instead of an open-coded version.

Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
Link: https://lore.kernel.org/r/20200527133942.GA10408@embeddedor
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/kernel/signal.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/m68k/kernel/signal.c b/arch/m68k/kernel/signal.c
index b3ff39588f36..fc034fd19798 100644
--- a/arch/m68k/kernel/signal.c
+++ b/arch/m68k/kernel/signal.c
@@ -61,25 +61,25 @@
 #define	FMT4SIZE	0
 #else
 #define	FORMAT		0
-#define	FMT4SIZE	sizeof(((struct frame *)0)->un.fmt4)
+#define	FMT4SIZE	sizeof_field(struct frame, un.fmt4)
 #endif
 
 static const int frame_size_change[16] = {
-  [1]	= -1, /* sizeof(((struct frame *)0)->un.fmt1), */
-  [2]	= sizeof(((struct frame *)0)->un.fmt2),
-  [3]	= sizeof(((struct frame *)0)->un.fmt3),
+  [1]	= -1, /* sizeof_field(struct frame, un.fmt1), */
+  [2]	= sizeof_field(struct frame, un.fmt2),
+  [3]	= sizeof_field(struct frame, un.fmt3),
   [4]	= FMT4SIZE,
-  [5]	= -1, /* sizeof(((struct frame *)0)->un.fmt5), */
-  [6]	= -1, /* sizeof(((struct frame *)0)->un.fmt6), */
-  [7]	= sizeof(((struct frame *)0)->un.fmt7),
-  [8]	= -1, /* sizeof(((struct frame *)0)->un.fmt8), */
-  [9]	= sizeof(((struct frame *)0)->un.fmt9),
-  [10]	= sizeof(((struct frame *)0)->un.fmta),
-  [11]	= sizeof(((struct frame *)0)->un.fmtb),
-  [12]	= -1, /* sizeof(((struct frame *)0)->un.fmtc), */
-  [13]	= -1, /* sizeof(((struct frame *)0)->un.fmtd), */
-  [14]	= -1, /* sizeof(((struct frame *)0)->un.fmte), */
-  [15]	= -1, /* sizeof(((struct frame *)0)->un.fmtf), */
+  [5]	= -1, /* sizeof_field(struct frame, un.fmt5), */
+  [6]	= -1, /* sizeof_field(struct frame, un.fmt6), */
+  [7]	= sizeof_field(struct frame, un.fmt7),
+  [8]	= -1, /* sizeof_field(struct frame, un.fmt8), */
+  [9]	= sizeof_field(struct frame, un.fmt9),
+  [10]	= sizeof_field(struct frame, un.fmta),
+  [11]	= sizeof_field(struct frame, un.fmtb),
+  [12]	= -1, /* sizeof_field(struct frame, un.fmtc), */
+  [13]	= -1, /* sizeof_field(struct frame, un.fmtd), */
+  [14]	= -1, /* sizeof_field(struct frame, un.fmte), */
+  [15]	= -1, /* sizeof_field(struct frame, un.fmtf), */
 };
 
 static inline int frame_extra_sizes(int f)
@@ -651,7 +651,7 @@ static int mangle_kernel_stack(struct pt_regs *regs, int formatvec,
 	} else {
 		struct switch_stack *sw = (struct switch_stack *)regs - 1;
 		/* yes, twice as much as max(sizeof(frame.un.fmt<x>)) */
-		unsigned long buf[sizeof(((struct frame *)0)->un) / 2];
+		unsigned long buf[sizeof_field(struct frame, un) / 2];
 
 		/* that'll make sure that expansion won't crap over data */
 		if (copy_from_user(buf + fsize / 4, fp, fsize))

From be1a31283655105606407502800871b9c1a1132f Mon Sep 17 00:00:00 2001
From: Michael Schmitz <schmitzmic@gmail.com>
Date: Sun, 31 May 2020 10:45:19 +1200
Subject: [PATCH 334/502] m68k: atari: Annotate dummy read in ROM port IO code
 as __maybe_unused

The Atari ROM port IO code uses dummy variables to implement writes
(not supported by the hardware) as reads that encode the write data
in part of the address. The value read from the ROM port in this
operation is discarded.

Annotate dummy variables as __maybe_unused to avoid a compiler warning
with W=1.

Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Michael Schmitz <schmitzmic@gmail.com>
Link: https://lore.kernel.org/r/1590878719-21219-1-git-send-email-schmitzmic@gmail.com
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/include/asm/raw_io.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/m68k/include/asm/raw_io.h b/arch/m68k/include/asm/raw_io.h
index 8a6dc6e5a279..911826ea83ce 100644
--- a/arch/m68k/include/asm/raw_io.h
+++ b/arch/m68k/include/asm/raw_io.h
@@ -80,14 +80,14 @@
 	({ u16 __v = le16_to_cpu(*(__force volatile u16 *) (addr)); __v; })
 
 #define rom_out_8(addr, b)	\
-	({u8 __w, __v = (b);  u32 _addr = ((u32) (addr)); \
+	({u8 __maybe_unused __w, __v = (b);  u32 _addr = ((u32) (addr)); \
 	__w = ((*(__force volatile u8 *)  ((_addr | 0x10000) + (__v<<1)))); })
 #define rom_out_be16(addr, w)	\
-	({u16 __w, __v = (w); u32 _addr = ((u32) (addr)); \
+	({u16 __maybe_unused __w, __v = (w); u32 _addr = ((u32) (addr)); \
 	__w = ((*(__force volatile u16 *) ((_addr & 0xFFFF0000UL) + ((__v & 0xFF)<<1)))); \
 	__w = ((*(__force volatile u16 *) ((_addr | 0x10000) + ((__v >> 8)<<1)))); })
 #define rom_out_le16(addr, w)	\
-	({u16 __w, __v = (w); u32 _addr = ((u32) (addr)); \
+	({u16 __maybe_unused __w, __v = (w); u32 _addr = ((u32) (addr)); \
 	__w = ((*(__force volatile u16 *) ((_addr & 0xFFFF0000UL) + ((__v >> 8)<<1)))); \
 	__w = ((*(__force volatile u16 *) ((_addr | 0x10000) + ((__v & 0xFF)<<1)))); })
 

From aeb445bf2194d83e12e85bf5c65baaf1f093bd8f Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sun, 31 May 2020 09:12:13 +1000
Subject: [PATCH 335/502] m68k: mac: Don't send IOP message until channel is
 idle

In the following sequence of calls, iop_do_send() gets called when the
"send" channel is not in the IOP_MSG_IDLE state:

	iop_ism_irq()
		iop_handle_send()
			(msg->handler)()
				iop_send_message()
			iop_do_send()

Avoid this by testing the channel state before calling iop_do_send().

When sending, if iop_send_queue is empty, call iop_do_send() because
the channel is idle. If iop_send_queue is not empty, iop_do_send() will
get called later by iop_handle_send().

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Stan Johnson <userm57@yahoo.com>
Cc: Joshua Thompson <funaho@jurai.org>
Link: https://lore.kernel.org/r/6d667c39e53865661fa5a48f16829d18ed8abe54.1590880333.git.fthain@telegraphics.com.au
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/mac/iop.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c
index d3775afb0f07..754f6478c30d 100644
--- a/arch/m68k/mac/iop.c
+++ b/arch/m68k/mac/iop.c
@@ -415,7 +415,8 @@ static void iop_handle_send(uint iop_num, uint chan)
 	msg->status = IOP_MSGSTATUS_UNUSED;
 	msg = msg->next;
 	iop_send_queue[iop_num][chan] = msg;
-	if (msg) iop_do_send(msg);
+	if (msg && iop_readb(iop, IOP_ADDR_SEND_STATE + chan) == IOP_MSG_IDLE)
+		iop_do_send(msg);
 }
 
 /*
@@ -489,16 +490,12 @@ int iop_send_message(uint iop_num, uint chan, void *privdata,
 
 	if (!(q = iop_send_queue[iop_num][chan])) {
 		iop_send_queue[iop_num][chan] = msg;
+		iop_do_send(msg);
 	} else {
 		while (q->next) q = q->next;
 		q->next = msg;
 	}
 
-	if (iop_readb(iop_base[iop_num],
-	    IOP_ADDR_SEND_STATE + chan) == IOP_MSG_IDLE) {
-		iop_do_send(msg);
-	}
-
 	return 0;
 }
 

From 931fc82a6aaf4e2e4a5490addaa6a090d78c24a7 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sun, 31 May 2020 09:12:13 +1000
Subject: [PATCH 336/502] m68k: mac: Fix IOP status/control register writes

When writing values to the IOP status/control register make sure those
values do not have any extraneous bits that will clear interrupt flags.

To place the SCC IOP into bypass mode would be desirable but this is not
achieved by writing IOP_DMAINACTIVE | IOP_RUN | IOP_AUTOINC | IOP_BYPASS
to the control register. Drop this ineffective register write.

Remove the flawed and unused iop_bypass() function. Make use of the
unused iop_stop() function.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Stan Johnson <userm57@yahoo.com>
Cc: Joshua Thompson <funaho@jurai.org>
Link: https://lore.kernel.org/r/09bcb7359a1719a18b551ee515da3c4c3cf709e6.1590880333.git.fthain@telegraphics.com.au
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/mac/iop.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c
index 754f6478c30d..bfc8daf50744 100644
--- a/arch/m68k/mac/iop.c
+++ b/arch/m68k/mac/iop.c
@@ -183,7 +183,7 @@ static __inline__ void iop_writeb(volatile struct mac_iop *iop, __u16 addr, __u8
 
 static __inline__ void iop_stop(volatile struct mac_iop *iop)
 {
-	iop->status_ctrl &= ~IOP_RUN;
+	iop->status_ctrl = IOP_AUTOINC;
 }
 
 static __inline__ void iop_start(volatile struct mac_iop *iop)
@@ -191,14 +191,9 @@ static __inline__ void iop_start(volatile struct mac_iop *iop)
 	iop->status_ctrl = IOP_RUN | IOP_AUTOINC;
 }
 
-static __inline__ void iop_bypass(volatile struct mac_iop *iop)
-{
-	iop->status_ctrl |= IOP_BYPASS;
-}
-
 static __inline__ void iop_interrupt(volatile struct mac_iop *iop)
 {
-	iop->status_ctrl |= IOP_IRQ;
+	iop->status_ctrl = IOP_IRQ | IOP_RUN | IOP_AUTOINC;
 }
 
 static int iop_alive(volatile struct mac_iop *iop)
@@ -244,7 +239,6 @@ void __init iop_preinit(void)
 		} else {
 			iop_base[IOP_NUM_SCC] = (struct mac_iop *) SCC_IOP_BASE_QUADRA;
 		}
-		iop_base[IOP_NUM_SCC]->status_ctrl = 0x87;
 		iop_scc_present = 1;
 	} else {
 		iop_base[IOP_NUM_SCC] = NULL;
@@ -256,7 +250,7 @@ void __init iop_preinit(void)
 		} else {
 			iop_base[IOP_NUM_ISM] = (struct mac_iop *) ISM_IOP_BASE_QUADRA;
 		}
-		iop_base[IOP_NUM_ISM]->status_ctrl = 0;
+		iop_stop(iop_base[IOP_NUM_ISM]);
 		iop_ism_present = 1;
 	} else {
 		iop_base[IOP_NUM_ISM] = NULL;

From adc19b2e314b3883a22e4f51654da4e6d8102d5d Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sun, 31 May 2020 09:12:13 +1000
Subject: [PATCH 337/502] m68k: mac: Don't send uninitialized data in IOP
 message reply

Clear the message reply before calling iop_complete_message(). This code
path is not normally executed but, should that happen, let's arrange for
consistent behaviour from the IOP.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Stan Johnson <userm57@yahoo.com>
Cc: Joshua Thompson <funaho@jurai.org>
Link: https://lore.kernel.org/r/8e35df4d193b082cb6285b1f30c949ff7e30e99e.1590880333.git.fthain@telegraphics.com.au
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/mac/iop.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c
index bfc8daf50744..8844963eea75 100644
--- a/arch/m68k/mac/iop.c
+++ b/arch/m68k/mac/iop.c
@@ -449,6 +449,7 @@ static void iop_handle_recv(uint iop_num, uint chan)
 		iop_pr_debug("unclaimed message on iop_num %d chan %d\n",
 		             iop_num, chan);
 		iop_pr_debug("%*ph\n", IOP_MSG_LEN, msg->message);
+		memset(msg->reply, 0, IOP_MSG_LEN);
 		iop_complete_message(msg);
 	}
 }

From 47fbcb9506df7cf02ccae6895be3f76fa5768eb1 Mon Sep 17 00:00:00 2001
From: Finn Thain <fthain@telegraphics.com.au>
Date: Sun, 31 May 2020 09:12:13 +1000
Subject: [PATCH 338/502] m68k: mac: Improve IOP debug messages

Always dump the full message and reply. Avoid printing partial lines
as this output gets mixed up with the output from called functions.
Don't output the state of idle channels.

Signed-off-by: Finn Thain <fthain@telegraphics.com.au>
Tested-by: Stan Johnson <userm57@yahoo.com>
Cc: Joshua Thompson <funaho@jurai.org>
Link: https://lore.kernel.org/r/317909d69244f06581973c5839382f5516cd9a1c.1590880333.git.fthain@telegraphics.com.au
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/mac/iop.c | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/m68k/mac/iop.c b/arch/m68k/mac/iop.c
index 8844963eea75..c669a7644301 100644
--- a/arch/m68k/mac/iop.c
+++ b/arch/m68k/mac/iop.c
@@ -347,8 +347,8 @@ void iop_complete_message(struct iop_msg *msg)
 	int chan = msg->channel;
 	int i,offset;
 
-	iop_pr_debug("msg %p iop_num %d channel %d\n", msg, msg->iop_num,
-	             msg->channel);
+	iop_pr_debug("iop_num %d chan %d reply %*ph\n",
+		     msg->iop_num, msg->channel, IOP_MSG_LEN, msg->reply);
 
 	offset = IOP_ADDR_RECV_MSG + (msg->channel * IOP_MSG_LEN);
 
@@ -372,6 +372,9 @@ static void iop_do_send(struct iop_msg *msg)
 	volatile struct mac_iop *iop = iop_base[msg->iop_num];
 	int i,offset;
 
+	iop_pr_debug("iop_num %d chan %d message %*ph\n",
+		     msg->iop_num, msg->channel, IOP_MSG_LEN, msg->message);
+
 	offset = IOP_ADDR_SEND_MSG + (msg->channel * IOP_MSG_LEN);
 
 	for (i = 0 ; i < IOP_MSG_LEN ; i++, offset++) {
@@ -394,8 +397,6 @@ static void iop_handle_send(uint iop_num, uint chan)
 	struct iop_msg *msg;
 	int i,offset;
 
-	iop_pr_debug("iop_num %d chan %d\n", iop_num, chan);
-
 	iop_writeb(iop, IOP_ADDR_SEND_STATE + chan, IOP_MSG_IDLE);
 
 	if (!(msg = iop_send_queue[iop_num][chan])) return;
@@ -405,6 +406,9 @@ static void iop_handle_send(uint iop_num, uint chan)
 	for (i = 0 ; i < IOP_MSG_LEN ; i++, offset++) {
 		msg->reply[i] = iop_readb(iop, offset);
 	}
+	iop_pr_debug("iop_num %d chan %d reply %*ph\n",
+		     iop_num, chan, IOP_MSG_LEN, msg->reply);
+
 	if (msg->handler) (*msg->handler)(msg);
 	msg->status = IOP_MSGSTATUS_UNUSED;
 	msg = msg->next;
@@ -424,8 +428,6 @@ static void iop_handle_recv(uint iop_num, uint chan)
 	int i,offset;
 	struct iop_msg *msg;
 
-	iop_pr_debug("iop_num %d chan %d\n", iop_num, chan);
-
 	msg = iop_get_unused_msg();
 	msg->iop_num = iop_num;
 	msg->channel = chan;
@@ -437,6 +439,8 @@ static void iop_handle_recv(uint iop_num, uint chan)
 	for (i = 0 ; i < IOP_MSG_LEN ; i++, offset++) {
 		msg->message[i] = iop_readb(iop, offset);
 	}
+	iop_pr_debug("iop_num %d chan %d message %*ph\n",
+		     iop_num, chan, IOP_MSG_LEN, msg->message);
 
 	iop_writeb(iop, IOP_ADDR_RECV_STATE + chan, IOP_MSG_RCVD);
 
@@ -446,9 +450,6 @@ static void iop_handle_recv(uint iop_num, uint chan)
 	if (msg->handler) {
 		(*msg->handler)(msg);
 	} else {
-		iop_pr_debug("unclaimed message on iop_num %d chan %d\n",
-		             iop_num, chan);
-		iop_pr_debug("%*ph\n", IOP_MSG_LEN, msg->message);
 		memset(msg->reply, 0, IOP_MSG_LEN);
 		iop_complete_message(msg);
 	}
@@ -559,35 +560,34 @@ irqreturn_t iop_ism_irq(int irq, void *dev_id)
 	int i,state;
 	u8 events = iop->status_ctrl & (IOP_INT0 | IOP_INT1);
 
-	iop_pr_debug("status %02X\n", iop->status_ctrl);
-
 	do {
+		iop_pr_debug("iop_num %d status %02X\n", iop_num,
+			     iop->status_ctrl);
+
 		/* INT0 indicates state change on an outgoing message channel */
 		if (events & IOP_INT0) {
 			iop->status_ctrl = IOP_INT0 | IOP_RUN | IOP_AUTOINC;
-			iop_pr_debug("new status %02X, send states",
-				     iop->status_ctrl);
 			for (i = 0; i < NUM_IOP_CHAN; i++) {
 				state = iop_readb(iop, IOP_ADDR_SEND_STATE + i);
-				iop_pr_cont(" %02X", state);
 				if (state == IOP_MSG_COMPLETE)
 					iop_handle_send(iop_num, i);
+				else if (state != IOP_MSG_IDLE)
+					iop_pr_debug("chan %d send state %02X\n",
+						     i, state);
 			}
-			iop_pr_cont("\n");
 		}
 
 		/* INT1 for incoming messages */
 		if (events & IOP_INT1) {
 			iop->status_ctrl = IOP_INT1 | IOP_RUN | IOP_AUTOINC;
-			iop_pr_debug("new status %02X, recv states",
-				     iop->status_ctrl);
 			for (i = 0; i < NUM_IOP_CHAN; i++) {
 				state = iop_readb(iop, IOP_ADDR_RECV_STATE + i);
-				iop_pr_cont(" %02X", state);
 				if (state == IOP_MSG_NEW)
 					iop_handle_recv(iop_num, i);
+				else if (state != IOP_MSG_IDLE)
+					iop_pr_debug("chan %d recv state %02X\n",
+						     i, state);
 			}
-			iop_pr_cont("\n");
 		}
 
 		events = iop->status_ctrl & (IOP_INT0 | IOP_INT1);

From e3a549487f08f6326b24e92b3d87f9683f1d74a4 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada <masahiroy@kernel.org>
Date: Wed, 17 Jun 2020 12:11:53 +0900
Subject: [PATCH 339/502] m68k: Use CLEAN_FILES to clean up files

The log of 'make ARCH=m68k clean' does not look nice.

$ make ARCH=m68k clean
  CLEAN   arch/m68k/kernel
  [ snip ]
  CLEAN   usr
rm -f vmlinux.gz vmlinux.bz2
  CLEAN   vmlinux.symvers modules.builtin modules.builtin.modinfo

Use CLEAN_FILES to simplify the code, and beautify the log.

Signed-off-by: Masahiro Yamada <masahiroy@kernel.org>
Link: https://lore.kernel.org/r/20200617031153.85858-1-masahiroy@kernel.org
Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
---
 arch/m68k/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/m68k/Makefile b/arch/m68k/Makefile
index 71ffaf5f8954..4438ffb4bbe1 100644
--- a/arch/m68k/Makefile
+++ b/arch/m68k/Makefile
@@ -138,8 +138,7 @@ else
 	$(KBZIP2) -1c vmlinux >vmlinux.bz2
 endif
 
-archclean:
-	rm -f vmlinux.gz vmlinux.bz2
+CLEAN_FILES += vmlinux.gz vmlinux.bz2
 
 archheaders:
 	$(Q)$(MAKE) $(build)=arch/m68k/kernel/syscalls all

From 382f429bb559fe991b1ece2e5e58c812e28b3ad8 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Mon, 6 Jul 2020 11:34:56 +0200
Subject: [PATCH 340/502] m68k: defconfig: Update defconfigs for v5.8-rc3

  - Re-enable modular build of DES crypto algorithm (no longer
    auto-enabled since commit be01369859b8aa07 ("esp, ah: modernize the
    crypto algorithm selections")),
  - Enable modular build of prime numbers and bitops test modules.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20200615075458.22088-1-geert@linux-m68k.org
Link: https://lore.kernel.org/r/20200706093456.15641-1-geert@linux-m68k.org
---
 arch/m68k/configs/amiga_defconfig    | 3 +++
 arch/m68k/configs/apollo_defconfig   | 3 +++
 arch/m68k/configs/atari_defconfig    | 3 +++
 arch/m68k/configs/bvme6000_defconfig | 3 +++
 arch/m68k/configs/hp300_defconfig    | 3 +++
 arch/m68k/configs/mac_defconfig      | 3 +++
 arch/m68k/configs/multi_defconfig    | 3 +++
 arch/m68k/configs/mvme147_defconfig  | 3 +++
 arch/m68k/configs/mvme16x_defconfig  | 3 +++
 arch/m68k/configs/q40_defconfig      | 3 +++
 arch/m68k/configs/sun3_defconfig     | 3 +++
 arch/m68k/configs/sun3x_defconfig    | 3 +++
 12 files changed, 36 insertions(+)

diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index 888b75e7fd79..f9f4fa595e13 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -594,6 +594,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -615,6 +616,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -643,6 +645,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 45303846b659..f4828e86d547 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -550,6 +550,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -571,6 +572,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -599,6 +601,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index de824c1bc3d3..e7911f141de1 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -572,6 +572,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -593,6 +594,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -621,6 +623,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index 071839ca6a59..d574e438e6db 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -543,6 +543,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -564,6 +565,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -592,6 +594,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index 37ac7b019ec1..c7ce206e6138 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -552,6 +552,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -573,6 +574,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -601,6 +603,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 608779866260..522dcf624aa5 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -574,6 +574,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -595,6 +596,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -623,6 +625,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 0abb53c38c20..2433409f4369 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -660,6 +660,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -681,6 +682,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -709,6 +711,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index cb14c234d3ad..5568aa7d9d41 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -542,6 +542,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -563,6 +564,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -591,6 +593,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index e8a1920aded7..5b1e72ce53f8 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -543,6 +543,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -564,6 +565,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -592,6 +594,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index 2cbf416fc725..c3a3dcf30fb9 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -561,6 +561,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -582,6 +583,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -610,6 +612,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index fed3cc7abcc4..3c00e52f1bf0 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -545,6 +545,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -566,6 +567,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -593,6 +595,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index 0954fde256e6..241242d73cbd 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -544,6 +544,7 @@ CONFIG_CRYPTO_BLOWFISH=m
 CONFIG_CRYPTO_CAMELLIA=m
 CONFIG_CRYPTO_CAST5=m
 CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_DES=m
 CONFIG_CRYPTO_FCRYPT=m
 CONFIG_CRYPTO_KHAZAD=m
 CONFIG_CRYPTO_SALSA20=m
@@ -565,6 +566,7 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
 CONFIG_CRYPTO_USER_API_RNG=m
 CONFIG_CRYPTO_USER_API_AEAD=m
 # CONFIG_CRYPTO_HW is not set
+CONFIG_PRIME_NUMBERS=m
 CONFIG_CRC32_SELFTEST=m
 CONFIG_CRC64=m
 CONFIG_XZ_DEC_TEST=m
@@ -593,6 +595,7 @@ CONFIG_TEST_OVERFLOW=m
 CONFIG_TEST_RHASHTABLE=m
 CONFIG_TEST_HASH=m
 CONFIG_TEST_IDA=m
+CONFIG_TEST_BITOPS=m
 CONFIG_TEST_VMALLOC=m
 CONFIG_TEST_USER_COPY=m
 CONFIG_TEST_BPF=m

From f011856ce7b600fdc2d1102d56873b787ff6d1bb Mon Sep 17 00:00:00 2001
From: Jay Chen <jkchen@linux.alibaba.com>
Date: Mon, 6 Jul 2020 19:22:45 +0800
Subject: [PATCH 341/502] perf/smmuv3: To simplify code for ioremap page in
 pmcg

Use devm_platform_get_and_ioremap_resource() to simplify the code
a bit.

Signed-off-by: Jay Chen <jkchen@linux.alibaba.com>
Link: https://lore.kernel.org/r/20200706112246.92220-2-jkchen@linux.alibaba.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_smmuv3_pmu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 48e28ef93a70..2d09f3e47d12 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -755,8 +755,7 @@ static int smmu_pmu_probe(struct platform_device *pdev)
 		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
 	};
 
-	res_0 = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	smmu_pmu->reg_base = devm_ioremap_resource(dev, res_0);
+	smmu_pmu->reg_base = devm_platform_get_and_ioremap_resource(pdev, 0, &res_0);
 	if (IS_ERR(smmu_pmu->reg_base))
 		return PTR_ERR(smmu_pmu->reg_base);
 

From 1583052d111f8ea43f9954c5e749164fd2b954af Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 26 Jun 2020 17:58:31 +0200
Subject: [PATCH 342/502] arm64/acpi: disallow AML memory opregions to access
 kernel memory

AML uses SystemMemory opregions to allow AML handlers to access MMIO
registers of, e.g., GPIO controllers, or access reserved regions of
memory that are owned by the firmware.

Currently, we also allow AML access to memory that is owned by the
kernel and mapped via the linear region, which does not seem to be
supported by a valid use case, and exposes the kernel's internal
state to AML methods that may be buggy and exploitable.

On arm64, ACPI support requires booting in EFI mode, and so we can cross
reference the requested region against the EFI memory map, rather than
just do a minimal check on the first page. So let's only permit regions
to be remapped by the ACPI core if
- they don't appear in the EFI memory map at all (which is the case for
  most MMIO), or
- they are covered by a single region in the EFI memory map, which is not
  of a type that describes memory that is given to the kernel at boot.

Reported-by: Jason A. Donenfeld <Jason@zx2c4.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Link: https://lore.kernel.org/r/20200626155832.2323789-2-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/acpi.h | 15 +-------
 arch/arm64/kernel/acpi.c      | 66 +++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index a45366c3909b..bd68e1b7f29f 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -47,20 +47,7 @@
 pgprot_t __acpi_get_mem_attribute(phys_addr_t addr);
 
 /* ACPI table mapping after acpi_permanent_mmap is set */
-static inline void __iomem *acpi_os_ioremap(acpi_physical_address phys,
-					    acpi_size size)
-{
-	/* For normal memory we already have a cacheable mapping. */
-	if (memblock_is_map_memory(phys))
-		return (void __iomem *)__phys_to_virt(phys);
-
-	/*
-	 * We should still honor the memory's attribute here because
-	 * crash dump kernel possibly excludes some ACPI (reclaim)
-	 * regions from memblock list.
-	 */
-	return __ioremap(phys, size, __acpi_get_mem_attribute(phys));
-}
+void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size);
 #define acpi_os_ioremap acpi_os_ioremap
 
 typedef u64 phys_cpuid_t;
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index a7586a4db142..01b861e225b0 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -261,6 +261,72 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr)
 	return __pgprot(PROT_DEVICE_nGnRnE);
 }
 
+void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
+{
+	efi_memory_desc_t *md, *region = NULL;
+	pgprot_t prot;
+
+	if (WARN_ON_ONCE(!efi_enabled(EFI_MEMMAP)))
+		return NULL;
+
+	for_each_efi_memory_desc(md) {
+		u64 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT);
+
+		if (phys < md->phys_addr || phys >= end)
+			continue;
+
+		if (phys + size > end) {
+			pr_warn(FW_BUG "requested region covers multiple EFI memory regions\n");
+			return NULL;
+		}
+		region = md;
+		break;
+	}
+
+	/*
+	 * It is fine for AML to remap regions that are not represented in the
+	 * EFI memory map at all, as it only describes normal memory, and MMIO
+	 * regions that require a virtual mapping to make them accessible to
+	 * the EFI runtime services.
+	 */
+	prot = __pgprot(PROT_DEVICE_nGnRnE);
+	if (region) {
+		switch (region->type) {
+		case EFI_LOADER_CODE:
+		case EFI_LOADER_DATA:
+		case EFI_BOOT_SERVICES_CODE:
+		case EFI_BOOT_SERVICES_DATA:
+		case EFI_CONVENTIONAL_MEMORY:
+		case EFI_PERSISTENT_MEMORY:
+			pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys);
+			return NULL;
+
+		case EFI_ACPI_RECLAIM_MEMORY:
+			/*
+			 * ACPI reclaim memory is used to pass firmware tables
+			 * and other data that is intended for consumption by
+			 * the OS only, which may decide it wants to reclaim
+			 * that memory and use it for something else. We never
+			 * do that, but we usually add it to the linear map
+			 * anyway, in which case we should use the existing
+			 * mapping.
+			 */
+			if (memblock_is_map_memory(phys))
+				return (void __iomem *)__phys_to_virt(phys);
+			/* fall through */
+
+		default:
+			if (region->attribute & EFI_MEMORY_WB)
+				prot = PAGE_KERNEL;
+			else if (region->attribute & EFI_MEMORY_WT)
+				prot = __pgprot(PROT_NORMAL_WT);
+			else if (region->attribute & EFI_MEMORY_WC)
+				prot = __pgprot(PROT_NORMAL_NC);
+		}
+	}
+	return __ioremap(phys, size, prot);
+}
+
 /*
  * Claim Synchronous External Aborts as a firmware first notification.
  *

From 325f5585ec36953a3fe2e000451f690440fe1bf5 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Fri, 26 Jun 2020 17:58:32 +0200
Subject: [PATCH 343/502] arm64/acpi: disallow writeable AML opregion mapping
 for EFI code regions

Given that the contents of EFI runtime code and data regions are
provided by the firmware, as well as the DSDT, it is not unimaginable
that AML code exists today that accesses EFI runtime code regions using
a SystemMemory OpRegion. There is nothing fundamentally wrong with that,
but since we take great care to ensure that executable code is never
mapped writeable and executable at the same time, we should not permit
AML to create a writable mapping.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Link: https://lore.kernel.org/r/20200626155832.2323789-3-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/acpi.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 01b861e225b0..455966401102 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -301,6 +301,15 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size)
 			pr_warn(FW_BUG "requested region covers kernel memory @ %pa\n", &phys);
 			return NULL;
 
+		case EFI_RUNTIME_SERVICES_CODE:
+			/*
+			 * This would be unusual, but not problematic per se,
+			 * as long as we take care not to create a writable
+			 * mapping for executable code.
+			 */
+			prot = PAGE_KERNEL_RO;
+			break;
+
 		case EFI_ACPI_RECLAIM_MEMORY:
 			/*
 			 * ACPI reclaim memory is used to pass firmware tables

From 0de674afe83cb23676ec391470251aaa9700f21a Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Jul 2020 19:24:02 +0100
Subject: [PATCH 344/502] arm64: stacktrace: Move export for
 save_stack_trace_tsk()

Due to refactoring way back in bb53c820c5b0f1 ("arm64: stacktrace: avoid
listing stacktrace functions in stacktrace") the EXPORT_SYMBOL_GPL() for
save_stack_trace_tsk() is at the end of __save_stack_trace() rather than
the function it exports. Move it to the expected location.

Signed-off-by: Mark Brown <broonie@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20200710182402.50473-1-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/stacktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 139679c745bf..2dd8e3b8b94b 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -199,12 +199,12 @@ static noinline void __save_stack_trace(struct task_struct *tsk,
 
 	put_task_stack(tsk);
 }
-EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 
 void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
 {
 	__save_stack_trace(tsk, trace, 1);
 }
+EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
 
 void save_stack_trace(struct stack_trace *trace)
 {

From abb7962adc80ab4f4313e8a065302525b6a9c2dc Mon Sep 17 00:00:00 2001
From: Anshuman Khandual <anshuman.khandual@arm.com>
Date: Wed, 1 Jul 2020 10:12:01 +0530
Subject: [PATCH 345/502] arm64/hugetlb: Reserve CMA areas for gigantic pages
 on 16K and 64K configs

Currently 'hugetlb_cma=' command line argument does not create CMA area on
ARM64_16K_PAGES and ARM64_64K_PAGES based platforms. Instead, it just ends
up with the following warning message. Reason being, hugetlb_cma_reserve()
never gets called for these huge page sizes.

[   64.255669] hugetlb_cma: the option isn't supported by current arch

This enables CMA areas reservation on ARM64_16K_PAGES and ARM64_64K_PAGES
configs by defining a unified arm64_hugetlb_cma_reserve() that is wrapped
in CONFIG_CMA. Call site for arm64_hugetlb_cma_reserve() is also protected
as <asm/hugetlb.h> is conditionally included and hence cannot contain stub
for the inverse config i.e !(CONFIG_HUGETLB_PAGE && CONFIG_CMA).

Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Link: https://lore.kernel.org/r/1593578521-24672-1-git-send-email-anshuman.khandual@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/hugetlb.h |  2 ++
 arch/arm64/mm/hugetlbpage.c      | 38 ++++++++++++++++++++++++++++++++
 arch/arm64/mm/init.c             |  4 ++--
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 94ba0c5bced2..5abf91e3494c 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -49,6 +49,8 @@ extern void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr,
 				 pte_t *ptep, pte_t pte, unsigned long sz);
 #define set_huge_swap_pte_at set_huge_swap_pte_at
 
+void __init arm64_hugetlb_cma_reserve(void);
+
 #include <asm-generic/hugetlb.h>
 
 #endif /* __ASM_HUGETLB_H */
diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
index c79084739096..aa421bf4956e 100644
--- a/arch/arm64/mm/hugetlbpage.c
+++ b/arch/arm64/mm/hugetlbpage.c
@@ -19,6 +19,44 @@
 #include <asm/tlbflush.h>
 #include <asm/pgalloc.h>
 
+/*
+ * HugeTLB Support Matrix
+ *
+ * ---------------------------------------------------
+ * | Page Size | CONT PTE |  PMD  | CONT PMD |  PUD  |
+ * ---------------------------------------------------
+ * |     4K    |   64K    |   2M  |    32M   |   1G  |
+ * |    16K    |    2M    |  32M  |     1G   |       |
+ * |    64K    |    2M    | 512M  |    16G   |       |
+ * ---------------------------------------------------
+ */
+
+/*
+ * Reserve CMA areas for the largest supported gigantic
+ * huge page when requested. Any other smaller gigantic
+ * huge pages could still be served from those areas.
+ */
+#ifdef CONFIG_CMA
+void __init arm64_hugetlb_cma_reserve(void)
+{
+	int order;
+
+#ifdef CONFIG_ARM64_4K_PAGES
+	order = PUD_SHIFT - PAGE_SHIFT;
+#else
+	order = CONT_PMD_SHIFT + PMD_SHIFT - PAGE_SHIFT;
+#endif
+	/*
+	 * HugeTLB CMA reservation is required for gigantic
+	 * huge pages which could not be allocated via the
+	 * page allocator. Just warn if there is any change
+	 * breaking this assumption.
+	 */
+	WARN_ON(order <= MAX_ORDER);
+	hugetlb_cma_reserve(order);
+}
+#endif /* CONFIG_CMA */
+
 #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
 bool arch_hugetlb_migration_supported(struct hstate *h)
 {
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 6c3eb424c613..f8c19c6c8e71 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -425,8 +425,8 @@ void __init bootmem_init(void)
 	 * initialize node_online_map that gets used in hugetlb_cma_reserve()
 	 * while allocating required CMA size across online nodes.
 	 */
-#ifdef CONFIG_ARM64_4K_PAGES
-	hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
+#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
+	arm64_hugetlb_cma_reserve();
 #endif
 
 	/*

From b620ba54547cd0f98e35c1be102eec2cc25fda5d Mon Sep 17 00:00:00 2001
From: Zhenyu Ye <yezhenyu2@huawei.com>
Date: Wed, 15 Jul 2020 15:19:43 +0800
Subject: [PATCH 346/502] arm64: tlb: Detect the ARMv8.4 TLBI RANGE feature

ARMv8.4-TLBI provides TLBI invalidation instructions that apply to a
range of input addresses. This patch detects this feature.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
Link: https://lore.kernel.org/r/20200715071945.897-2-yezhenyu2@huawei.com
[catalin.marinas@arm.com: some renaming for consistency]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpucaps.h |  3 ++-
 arch/arm64/include/asm/sysreg.h  |  3 +++
 arch/arm64/kernel/cpufeature.c   | 10 ++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index d44ba903d11d..07b643a70710 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -63,7 +63,8 @@
 #define ARM64_HAS_32BIT_EL1			53
 #define ARM64_BTI				54
 #define ARM64_HAS_ARMv8_4_TTL			55
+#define ARM64_HAS_TLB_RANGE			56
 
-#define ARM64_NCAPS				56
+#define ARM64_NCAPS				57
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 8c209aa17273..551f30ace4db 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -617,6 +617,9 @@
 #define ID_AA64ISAR0_SHA1_SHIFT		8
 #define ID_AA64ISAR0_AES_SHIFT		4
 
+#define ID_AA64ISAR0_TLB_RANGE_NI	0x0
+#define ID_AA64ISAR0_TLB_RANGE		0x2
+
 /* id_aa64isar1 */
 #define ID_AA64ISAR1_I8MM_SHIFT		52
 #define ID_AA64ISAR1_DGH_SHIFT		48
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e877f56ff1ab..2f5adefef34d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1893,6 +1893,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.min_field_value = 1,
 		.matches = has_cpuid_feature,
 	},
+	{
+		.desc = "TLB range maintenance instructions",
+		.capability = ARM64_HAS_TLB_RANGE,
+		.type = ARM64_CPUCAP_SYSTEM_FEATURE,
+		.matches = has_cpuid_feature,
+		.sys_reg = SYS_ID_AA64ISAR0_EL1,
+		.field_pos = ID_AA64ISAR0_TLB_SHIFT,
+		.sign = FTR_UNSIGNED,
+		.min_field_value = ID_AA64ISAR0_TLB_RANGE,
+	},
 #ifdef CONFIG_ARM64_HW_AFDBM
 	{
 		/*

From 7c78f67e9bd97478d56157c2ad53823668b5b822 Mon Sep 17 00:00:00 2001
From: Zhenyu Ye <yezhenyu2@huawei.com>
Date: Wed, 15 Jul 2020 15:19:44 +0800
Subject: [PATCH 347/502] arm64: enable tlbi range instructions

The TLBI RANGE feature introduces new assembly instructions that are only
supported by binutils >= 2.30.  Add the necessary Kconfig logic to allow
this to be enabled and pass '-march=armv8.4-a' to KBUILD_CFLAGS.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
Link: https://lore.kernel.org/r/20200715071945.897-3-yezhenyu2@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig  | 14 ++++++++++++++
 arch/arm64/Makefile |  7 +++++++
 2 files changed, 21 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 66dc41fd49f2..0f39468dbc60 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1596,6 +1596,20 @@ config ARM64_AMU_EXTN
 	  correctly reflect reality. Most commonly, the value read will be 0,
 	  indicating that the counter is not enabled.
 
+config AS_HAS_ARMV8_4
+	def_bool $(cc-option,-Wa$(comma)-march=armv8.4-a)
+
+config ARM64_TLB_RANGE
+	bool "Enable support for tlbi range feature"
+	default y
+	depends on AS_HAS_ARMV8_4
+	help
+	  ARMv8.4-TLBI provides TLBI invalidation instruction that apply to a
+	  range of input addresses.
+
+	  The feature introduces new assembly instructions, and they were
+	  support when binutils >= 2.30.
+
 endmenu
 
 menu "ARMv8.5 architectural features"
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index a0d94d063fa8..4e823b97c92e 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -82,11 +82,18 @@ endif
 # compiler to generate them and consequently to break the single image contract
 # we pass it only to the assembler. This option is utilized only in case of non
 # integrated assemblers.
+ifneq ($(CONFIG_AS_HAS_ARMV8_4), y)
 branch-prot-flags-$(CONFIG_AS_HAS_PAC) += -Wa,-march=armv8.3-a
 endif
+endif
 
 KBUILD_CFLAGS += $(branch-prot-flags-y)
 
+ifeq ($(CONFIG_AS_HAS_ARMV8_4), y)
+# make sure to pass the newest target architecture to -march.
+KBUILD_CFLAGS	+= -Wa,-march=armv8.4-a
+endif
+
 ifeq ($(CONFIG_SHADOW_CALL_STACK), y)
 KBUILD_CFLAGS	+= -ffixed-x18
 endif

From d1d3aa98b1d4826a19adfefb69b96142a0cac633 Mon Sep 17 00:00:00 2001
From: Zhenyu Ye <yezhenyu2@huawei.com>
Date: Wed, 15 Jul 2020 15:19:45 +0800
Subject: [PATCH 348/502] arm64: tlb: Use the TLBI RANGE feature in arm64

Add __TLBI_VADDR_RANGE macro and rewrite __flush_tlb_range().

When the CPU supports the TLBI RANGE feature, the minimum range
granularity is decided by 'scale', so in some cases we cannot flush all
pages with a single instruction.

For example, when the pages = 0xe81a, let's start 'scale' from
maximum, and find right 'num' for each 'scale':

1. scale = 3, we can flush no pages because the minimum range is
   2^(5*3 + 1) = 0x10000.
2. scale = 2, the minimum range is 2^(5*2 + 1) = 0x800, we can
   flush 0xe800 pages this time, the num = 0xe800/0x800 - 1 = 0x1c.
   Remaining pages is 0x1a;
3. scale = 1, the minimum range is 2^(5*1 + 1) = 0x40, no page
   can be flushed.
4. scale = 0, we flush the remaining 0x1a pages, the num =
   0x1a/0x2 - 1 = 0xc.

However, in most scenarios, pages = 1 when flush_tlb_range() is
called. Starting from scale = 3 or some other value (such as scale =
ilog2(pages)) would incur extra overhead.
So increase 'scale' from 0 to the maximum; the flush order is then exactly
the opposite of the example above.

Signed-off-by: Zhenyu Ye <yezhenyu2@huawei.com>
Link: https://lore.kernel.org/r/20200715071945.897-4-yezhenyu2@huawei.com
[catalin.marinas@arm.com: removed unnecessary masks in __TLBI_VADDR_RANGE]
[catalin.marinas@arm.com: __TLB_RANGE_NUM subtracts 1]
[catalin.marinas@arm.com: minor adjustments to the comments]
[catalin.marinas@arm.com: introduce system_supports_tlb_range()]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpufeature.h |   6 ++
 arch/arm64/include/asm/tlbflush.h   | 156 ++++++++++++++++++++++------
 2 files changed, 132 insertions(+), 30 deletions(-)

diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 5d1f4ae42799..cf56daa95a7d 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -692,6 +692,12 @@ static inline bool system_supports_bti(void)
 	return IS_ENABLED(CONFIG_ARM64_BTI) && cpus_have_const_cap(ARM64_BTI);
 }
 
+static inline bool system_supports_tlb_range(void)
+{
+	return IS_ENABLED(CONFIG_ARM64_TLB_RANGE) &&
+		cpus_have_const_cap(ARM64_HAS_TLB_RANGE);
+}
+
 #define ARM64_BP_HARDEN_UNKNOWN		-1
 #define ARM64_BP_HARDEN_WA_NEEDED	0
 #define ARM64_BP_HARDEN_NOT_REQUIRED	1
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index 2cb275efcea3..d493174415db 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -60,6 +60,31 @@
 		__ta;						\
 	})
 
+/*
+ * Get translation granule of the system, which is decided by
+ * PAGE_SIZE.  Used by TTL.
+ *  - 4KB	: 1
+ *  - 16KB	: 2
+ *  - 64KB	: 3
+ */
+#define TLBI_TTL_TG_4K		1
+#define TLBI_TTL_TG_16K		2
+#define TLBI_TTL_TG_64K		3
+
+static inline unsigned long get_trans_granule(void)
+{
+	switch (PAGE_SIZE) {
+	case SZ_4K:
+		return TLBI_TTL_TG_4K;
+	case SZ_16K:
+		return TLBI_TTL_TG_16K;
+	case SZ_64K:
+		return TLBI_TTL_TG_64K;
+	default:
+		return 0;
+	}
+}
+
 /*
  * Level-based TLBI operations.
  *
@@ -73,9 +98,6 @@
  * in asm/stage2_pgtable.h.
  */
 #define TLBI_TTL_MASK		GENMASK_ULL(47, 44)
-#define TLBI_TTL_TG_4K		1
-#define TLBI_TTL_TG_16K		2
-#define TLBI_TTL_TG_64K		3
 
 #define __tlbi_level(op, addr, level) do {				\
 	u64 arg = addr;							\
@@ -83,19 +105,7 @@
 	if (cpus_have_const_cap(ARM64_HAS_ARMv8_4_TTL) &&		\
 	    level) {							\
 		u64 ttl = level & 3;					\
-									\
-		switch (PAGE_SIZE) {					\
-		case SZ_4K:						\
-			ttl |= TLBI_TTL_TG_4K << 2;			\
-			break;						\
-		case SZ_16K:						\
-			ttl |= TLBI_TTL_TG_16K << 2;			\
-			break;						\
-		case SZ_64K:						\
-			ttl |= TLBI_TTL_TG_64K << 2;			\
-			break;						\
-		}							\
-									\
+		ttl |= get_trans_granule() << 2;			\
 		arg &= ~TLBI_TTL_MASK;					\
 		arg |= FIELD_PREP(TLBI_TTL_MASK, ttl);			\
 	}								\
@@ -108,6 +118,44 @@
 		__tlbi_level(op, (arg | USER_ASID_FLAG), level);	\
 } while (0)
 
+/*
+ * This macro creates a properly formatted VA operand for the TLB RANGE.
+ * The value bit assignments are:
+ *
+ * +----------+------+-------+-------+-------+----------------------+
+ * |   ASID   |  TG  | SCALE |  NUM  |  TTL  |        BADDR         |
+ * +-----------------+-------+-------+-------+----------------------+
+ * |63      48|47  46|45   44|43   39|38   37|36                   0|
+ *
+ * The address range is determined by below formula:
+ * [BADDR, BADDR + (NUM + 1) * 2^(5*SCALE + 1) * PAGESIZE)
+ *
+ */
+#define __TLBI_VADDR_RANGE(addr, asid, scale, num, ttl)		\
+	({							\
+		unsigned long __ta = (addr) >> PAGE_SHIFT;	\
+		__ta &= GENMASK_ULL(36, 0);			\
+		__ta |= (unsigned long)(ttl) << 37;		\
+		__ta |= (unsigned long)(num) << 39;		\
+		__ta |= (unsigned long)(scale) << 44;		\
+		__ta |= get_trans_granule() << 46;		\
+		__ta |= (unsigned long)(asid) << 48;		\
+		__ta;						\
+	})
+
+/* These macros are used by the TLBI RANGE feature. */
+#define __TLBI_RANGE_PAGES(num, scale)	\
+	((unsigned long)((num) + 1) << (5 * (scale) + 1))
+#define MAX_TLBI_RANGE_PAGES		__TLBI_RANGE_PAGES(31, 3)
+
+/*
+ * Generate 'num' values from -1 to 30 with -1 rejected by the
+ * __flush_tlb_range() loop below.
+ */
+#define TLBI_RANGE_MASK			GENMASK_ULL(4, 0)
+#define __TLBI_RANGE_NUM(pages, scale)	\
+	((((pages) >> (5 * (scale) + 1)) & TLBI_RANGE_MASK) - 1)
+
 /*
  *	TLB Invalidation
  *	================
@@ -231,32 +279,80 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma,
 				     unsigned long stride, bool last_level,
 				     int tlb_level)
 {
+	int num = 0;
+	int scale = 0;
 	unsigned long asid = ASID(vma->vm_mm);
 	unsigned long addr;
+	unsigned long pages;
 
 	start = round_down(start, stride);
 	end = round_up(end, stride);
+	pages = (end - start) >> PAGE_SHIFT;
 
-	if ((end - start) >= (MAX_TLBI_OPS * stride)) {
+	/*
+	 * When not uses TLB range ops, we can handle up to
+	 * (MAX_TLBI_OPS - 1) pages;
+	 * When uses TLB range ops, we can handle up to
+	 * (MAX_TLBI_RANGE_PAGES - 1) pages.
+	 */
+	if ((!system_supports_tlb_range() &&
+	     (end - start) >= (MAX_TLBI_OPS * stride)) ||
+	    pages >= MAX_TLBI_RANGE_PAGES) {
 		flush_tlb_mm(vma->vm_mm);
 		return;
 	}
 
-	/* Convert the stride into units of 4k */
-	stride >>= 12;
-
-	start = __TLBI_VADDR(start, asid);
-	end = __TLBI_VADDR(end, asid);
-
 	dsb(ishst);
-	for (addr = start; addr < end; addr += stride) {
-		if (last_level) {
-			__tlbi_level(vale1is, addr, tlb_level);
-			__tlbi_user_level(vale1is, addr, tlb_level);
-		} else {
-			__tlbi_level(vae1is, addr, tlb_level);
-			__tlbi_user_level(vae1is, addr, tlb_level);
+
+	/*
+	 * When the CPU does not support TLB range operations, flush the TLB
+	 * entries one by one at the granularity of 'stride'. If the the TLB
+	 * range ops are supported, then:
+	 *
+	 * 1. If 'pages' is odd, flush the first page through non-range
+	 *    operations;
+	 *
+	 * 2. For remaining pages: the minimum range granularity is decided
+	 *    by 'scale', so multiple range TLBI operations may be required.
+	 *    Start from scale = 0, flush the corresponding number of pages
+	 *    ((num+1)*2^(5*scale+1) starting from 'addr'), then increase it
+	 *    until no pages left.
+	 *
+	 * Note that certain ranges can be represented by either num = 31 and
+	 * scale or num = 0 and scale + 1. The loop below favours the latter
+	 * since num is limited to 30 by the __TLBI_RANGE_NUM() macro.
+	 */
+	while (pages > 0) {
+		if (!system_supports_tlb_range() ||
+		    pages % 2 == 1) {
+			addr = __TLBI_VADDR(start, asid);
+			if (last_level) {
+				__tlbi_level(vale1is, addr, tlb_level);
+				__tlbi_user_level(vale1is, addr, tlb_level);
+			} else {
+				__tlbi_level(vae1is, addr, tlb_level);
+				__tlbi_user_level(vae1is, addr, tlb_level);
+			}
+			start += stride;
+			pages -= stride >> PAGE_SHIFT;
+			continue;
 		}
+
+		num = __TLBI_RANGE_NUM(pages, scale);
+		if (num >= 0) {
+			addr = __TLBI_VADDR_RANGE(start, asid, scale,
+						  num, tlb_level);
+			if (last_level) {
+				__tlbi(rvale1is, addr);
+				__tlbi_user(rvale1is, addr);
+			} else {
+				__tlbi(rvae1is, addr);
+				__tlbi_user(rvae1is, addr);
+			}
+			start += __TLBI_RANGE_PAGES(num, scale) << PAGE_SHIFT;
+			pages -= __TLBI_RANGE_PAGES(num, scale);
+		}
+		scale++;
 	}
 	dsb(ish);
 }

From 5be542e945cb39a2457aa2cfe8b84aac95ef0f2d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Jul 2020 16:36:50 +1000
Subject: [PATCH 349/502] lockdep: Move list.h inclusion into lockdep.h

Currently lockdep_types.h includes list.h without actually using any
of its macros or functions.  All it needs are the type definitions
which were moved into types.h long ago.  This potentially causes
inclusion loops because both are included by many core header
files.

This patch moves the list.h inclusion into lockdep.h.  Note that
we could probably remove it completely but that could potentially
result in compile failures should any end users not include list.h
directly and also be unlucky enough to not get list.h via some other
header file.

Reported-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Petr Mladek <pmladek@suse.com>
Link: https://lkml.kernel.org/r/20200716063649.GA23065@gondor.apana.org.au
---
 include/linux/lockdep.h       | 1 +
 include/linux/lockdep_types.h | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index fd04b9e96091..7aafba0ddcf9 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -22,6 +22,7 @@ extern int lock_stat;
 #ifdef CONFIG_LOCKDEP
 
 #include <linux/linkage.h>
+#include <linux/list.h>
 #include <linux/debug_locks.h>
 #include <linux/stacktrace.h>
 
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 7b9350624577..bb35b449f533 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -32,8 +32,6 @@ enum lockdep_wait_type {
 
 #ifdef CONFIG_LOCKDEP
 
-#include <linux/list.h>
-
 /*
  * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
  * the total number of states... :-(

From 482cbb6cc33dca60091048631cd0a8dde72c3da7 Mon Sep 17 00:00:00 2001
From: "Alexander A. Klimov" <grandmaster@al2klimov.de>
Date: Mon, 13 Jul 2020 13:57:28 +0200
Subject: [PATCH 350/502] docs: locking: Replace HTTP links with HTTPS ones

Rationale:
Reduces attack surface on kernel devs opening the links for MITM
as HTTPS traffic is much harder to manipulate.

Deterministic algorithm:
For each file:
  If not .svg:
    For each line:
      If doesn't contain `\bxmlns\b`:
        For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
	  If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
            If both the HTTP and HTTPS versions
            return 200 OK and serve the same content:
              Replace HTTP with HTTPS.

Signed-off-by: Alexander A. Klimov <grandmaster@al2klimov.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200713115728.33905-1-grandmaster@al2klimov.de
---
 Documentation/locking/mutex-design.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/locking/mutex-design.rst b/Documentation/locking/mutex-design.rst
index 4d8236b81fa5..8f3e9a5141f9 100644
--- a/Documentation/locking/mutex-design.rst
+++ b/Documentation/locking/mutex-design.rst
@@ -18,7 +18,7 @@ as an alternative to these. This new data structure provided a number
 of advantages, including simpler interfaces, and at that time smaller
 code (see Disadvantages).
 
-[1] http://lwn.net/Articles/164802/
+[1] https://lwn.net/Articles/164802/
 
 Implementation
 --------------

From a9232dc5607dbada801f2fe83ea307cda762969a Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 11 Jul 2020 17:59:54 +0300
Subject: [PATCH 351/502] rwsem: fix commas in initialisation

Leading comma prevents arbitrary reordering of initialisation clauses.
The whole point of C99 initialisation is to allow any such reordering.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200711145954.GA1178171@localhost.localdomain
---
 include/linux/rwsem.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 7e5b2a4eb560..25e3fde85617 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -60,39 +60,39 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 }
 
 #define RWSEM_UNLOCKED_VALUE		0L
-#define __RWSEM_INIT_COUNT(name)	.count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE)
+#define __RWSEM_COUNT_INIT(name)	.count = ATOMIC_LONG_INIT(RWSEM_UNLOCKED_VALUE)
 
 /* Common initializer macros and functions */
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # define __RWSEM_DEP_MAP_INIT(lockname)			\
-	, .dep_map = {					\
+	.dep_map = {					\
 		.name = #lockname,			\
 		.wait_type_inner = LD_WAIT_SLEEP,	\
-	}
+	},
 #else
 # define __RWSEM_DEP_MAP_INIT(lockname)
 #endif
 
 #ifdef CONFIG_DEBUG_RWSEMS
-# define __DEBUG_RWSEM_INITIALIZER(lockname) , .magic = &lockname
+# define __RWSEM_DEBUG_INIT(lockname) .magic = &lockname,
 #else
-# define __DEBUG_RWSEM_INITIALIZER(lockname)
+# define __RWSEM_DEBUG_INIT(lockname)
 #endif
 
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED
+#define __RWSEM_OPT_INIT(lockname) .osq = OSQ_LOCK_UNLOCKED,
 #else
 #define __RWSEM_OPT_INIT(lockname)
 #endif
 
 #define __RWSEM_INITIALIZER(name)				\
-	{ __RWSEM_INIT_COUNT(name),				\
+	{ __RWSEM_COUNT_INIT(name),				\
 	  .owner = ATOMIC_LONG_INIT(0),				\
-	  .wait_list = LIST_HEAD_INIT((name).wait_list),	\
-	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock)	\
 	  __RWSEM_OPT_INIT(name)				\
-	  __DEBUG_RWSEM_INITIALIZER(name)			\
+	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock),\
+	  .wait_list = LIST_HEAD_INIT((name).wait_list),	\
+	  __RWSEM_DEBUG_INIT(name)				\
 	  __RWSEM_DEP_MAP_INIT(name) }
 
 #define DECLARE_RWSEM(name) \

From 0f85c4805184765ff35e0079b3241ee8f25d1b2b Mon Sep 17 00:00:00 2001
From: Qinglang Miao <miaoqinglang@huawei.com>
Date: Thu, 16 Jul 2020 16:47:47 +0800
Subject: [PATCH 352/502] debugobjects: Convert to DEFINE_SHOW_ATTRIBUTE

Use DEFINE_SHOW_ATTRIBUTE macro to simplify the code.

[ tglx: Disentangled it from the mess in -next ]

Signed-off-by: Qinglang Miao <miaoqinglang@huawei.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: hch@lst.de
Link: https://lkml.kernel.org/r/20200716084747.8034-1-miaoqinglang@huawei.com
---
 lib/debugobjects.c | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 48054dbf1b51..fe4557955d97 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -1022,18 +1022,7 @@ static int debug_stats_show(struct seq_file *m, void *v)
 	seq_printf(m, "objs_freed    :%d\n", debug_objects_freed);
 	return 0;
 }
-
-static int debug_stats_open(struct inode *inode, struct file *filp)
-{
-	return single_open(filp, debug_stats_show, NULL);
-}
-
-static const struct file_operations debug_stats_fops = {
-	.open		= debug_stats_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(debug_stats);
 
 static int __init debug_objects_init_debugfs(void)
 {

From 9180bd467f9abdb44afde650d07e3b9dd66d837c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
Date: Thu, 2 Jul 2020 17:28:40 -0300
Subject: [PATCH 353/502] futex: Remove put_futex_key()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since 4b39f99c ("futex: Remove {get,drop}_futex_key_refs()"),
put_futex_key() is empty.

Remove all references to this function and the then-redundant labels.

Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200702202843.520764-2-andrealmeid@collabora.com
---
 kernel/futex.c | 61 ++++++++++----------------------------------------
 1 file changed, 12 insertions(+), 49 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index e646661f6282..bd9adfca5d51 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -677,10 +677,6 @@ out:
 	return err;
 }
 
-static inline void put_futex_key(union futex_key *key)
-{
-}
-
 /**
  * fault_in_user_writeable() - Fault in user address and verify RW access
  * @uaddr:	pointer to faulting user space address
@@ -1617,7 +1613,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 
 	/* Make sure we really have tasks to wakeup */
 	if (!hb_waiters_pending(hb))
-		goto out_put_key;
+		goto out;
 
 	spin_lock(&hb->lock);
 
@@ -1640,8 +1636,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 
 	spin_unlock(&hb->lock);
 	wake_up_q(&wake_q);
-out_put_key:
-	put_futex_key(&key);
 out:
 	return ret;
 }
@@ -1712,7 +1706,7 @@ retry:
 		goto out;
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
-		goto out_put_key1;
+		goto out;
 
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
@@ -1730,13 +1724,13 @@ retry_private:
 			 * an MMU, but we might get them from range checking
 			 */
 			ret = op_ret;
-			goto out_put_keys;
+			goto out;
 		}
 
 		if (op_ret == -EFAULT) {
 			ret = fault_in_user_writeable(uaddr2);
 			if (ret)
-				goto out_put_keys;
+				goto out;
 		}
 
 		if (!(flags & FLAGS_SHARED)) {
@@ -1744,8 +1738,6 @@ retry_private:
 			goto retry_private;
 		}
 
-		put_futex_key(&key2);
-		put_futex_key(&key1);
 		cond_resched();
 		goto retry;
 	}
@@ -1781,10 +1773,6 @@ retry_private:
 out_unlock:
 	double_unlock_hb(hb1, hb2);
 	wake_up_q(&wake_q);
-out_put_keys:
-	put_futex_key(&key2);
-out_put_key1:
-	put_futex_key(&key1);
 out:
 	return ret;
 }
@@ -1996,7 +1984,7 @@ retry:
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
 			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
 	if (unlikely(ret != 0))
-		goto out_put_key1;
+		goto out;
 
 	/*
 	 * The check above which compares uaddrs is not sufficient for
@@ -2004,7 +1992,7 @@ retry:
 	 */
 	if (requeue_pi && match_futex(&key1, &key2)) {
 		ret = -EINVAL;
-		goto out_put_keys;
+		goto out;
 	}
 
 	hb1 = hash_futex(&key1);
@@ -2025,13 +2013,11 @@ retry_private:
 
 			ret = get_user(curval, uaddr1);
 			if (ret)
-				goto out_put_keys;
+				goto out;
 
 			if (!(flags & FLAGS_SHARED))
 				goto retry_private;
 
-			put_futex_key(&key2);
-			put_futex_key(&key1);
 			goto retry;
 		}
 		if (curval != *cmpval) {
@@ -2090,8 +2076,6 @@ retry_private:
 		case -EFAULT:
 			double_unlock_hb(hb1, hb2);
 			hb_waiters_dec(hb2);
-			put_futex_key(&key2);
-			put_futex_key(&key1);
 			ret = fault_in_user_writeable(uaddr2);
 			if (!ret)
 				goto retry;
@@ -2106,8 +2090,6 @@ retry_private:
 			 */
 			double_unlock_hb(hb1, hb2);
 			hb_waiters_dec(hb2);
-			put_futex_key(&key2);
-			put_futex_key(&key1);
 			/*
 			 * Handle the case where the owner is in the middle of
 			 * exiting. Wait for the exit to complete otherwise
@@ -2217,10 +2199,6 @@ out_unlock:
 	wake_up_q(&wake_q);
 	hb_waiters_dec(hb2);
 
-out_put_keys:
-	put_futex_key(&key2);
-out_put_key1:
-	put_futex_key(&key1);
 out:
 	return ret ? ret : task_count;
 }
@@ -2697,7 +2675,6 @@ retry_private:
 		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
 
-		put_futex_key(&q->key);
 		goto retry;
 	}
 
@@ -2707,8 +2684,6 @@ retry_private:
 	}
 
 out:
-	if (ret)
-		put_futex_key(&q->key);
 	return ret;
 }
 
@@ -2853,7 +2828,6 @@ retry_private:
 			 * - EAGAIN: The user space value changed.
 			 */
 			queue_unlock(hb);
-			put_futex_key(&q.key);
 			/*
 			 * Handle the case where the owner is in the middle of
 			 * exiting. Wait for the exit to complete otherwise
@@ -2961,13 +2935,11 @@ no_block:
 		put_pi_state(pi_state);
 	}
 
-	goto out_put_key;
+	goto out;
 
 out_unlock_put_key:
 	queue_unlock(hb);
 
-out_put_key:
-	put_futex_key(&q.key);
 out:
 	if (to) {
 		hrtimer_cancel(&to->timer);
@@ -2980,12 +2952,11 @@ uaddr_faulted:
 
 	ret = fault_in_user_writeable(uaddr);
 	if (ret)
-		goto out_put_key;
+		goto out;
 
 	if (!(flags & FLAGS_SHARED))
 		goto retry_private;
 
-	put_futex_key(&q.key);
 	goto retry;
 }
 
@@ -3114,16 +3085,13 @@ retry:
 out_unlock:
 	spin_unlock(&hb->lock);
 out_putkey:
-	put_futex_key(&key);
 	return ret;
 
 pi_retry:
-	put_futex_key(&key);
 	cond_resched();
 	goto retry;
 
 pi_faulted:
-	put_futex_key(&key);
 
 	ret = fault_in_user_writeable(uaddr);
 	if (!ret)
@@ -3265,7 +3233,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	 */
 	ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
 	if (ret)
-		goto out_key2;
+		goto out;
 
 	/*
 	 * The check above which compares uaddrs is not sufficient for
@@ -3274,7 +3242,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	if (match_futex(&q.key, &key2)) {
 		queue_unlock(hb);
 		ret = -EINVAL;
-		goto out_put_keys;
+		goto out;
 	}
 
 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
@@ -3284,7 +3252,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
 	spin_unlock(&hb->lock);
 	if (ret)
-		goto out_put_keys;
+		goto out;
 
 	/*
 	 * In order for us to be here, we know our q.key == key2, and since
@@ -3374,11 +3342,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		ret = -EWOULDBLOCK;
 	}
 
-out_put_keys:
-	put_futex_key(&q.key);
-out_key2:
-	put_futex_key(&key2);
-
 out:
 	if (to) {
 		hrtimer_cancel(&to->timer);

From d7c5ed73b19c4640426d9c106f70ec2cb532034d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
Date: Thu, 2 Jul 2020 17:28:41 -0300
Subject: [PATCH 354/502] futex: Remove needless goto's
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As stated in the coding style documentation, "if there is no cleanup
needed then just return directly", instead of jumping to a label and
then returning.

Remove such goto's and replace with a return statement.  When there's a
ternary operator on the return value, replace it with the result of the
operation when it is logically possible to determine it by the control
flow.

Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200702202843.520764-3-andrealmeid@collabora.com
---
 kernel/futex.c | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index bd9adfca5d51..362fbca6d614 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1607,13 +1607,13 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 
 	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_READ);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 
 	hb = hash_futex(&key);
 
 	/* Make sure we really have tasks to wakeup */
 	if (!hb_waiters_pending(hb))
-		goto out;
+		return ret;
 
 	spin_lock(&hb->lock);
 
@@ -1636,7 +1636,6 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
 
 	spin_unlock(&hb->lock);
 	wake_up_q(&wake_q);
-out:
 	return ret;
 }
 
@@ -1703,10 +1702,10 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
 retry:
 	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
@@ -1724,13 +1723,13 @@ retry_private:
 			 * an MMU, but we might get them from range checking
 			 */
 			ret = op_ret;
-			goto out;
+			return ret;
 		}
 
 		if (op_ret == -EFAULT) {
 			ret = fault_in_user_writeable(uaddr2);
 			if (ret)
-				goto out;
+				return ret;
 		}
 
 		if (!(flags & FLAGS_SHARED)) {
@@ -1773,7 +1772,6 @@ retry_private:
 out_unlock:
 	double_unlock_hb(hb1, hb2);
 	wake_up_q(&wake_q);
-out:
 	return ret;
 }
 
@@ -1980,20 +1978,18 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 retry:
 	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, FUTEX_READ);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
 			    requeue_pi ? FUTEX_WRITE : FUTEX_READ);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 
 	/*
 	 * The check above which compares uaddrs is not sufficient for
 	 * shared futexes. We need to compare the keys:
 	 */
-	if (requeue_pi && match_futex(&key1, &key2)) {
-		ret = -EINVAL;
-		goto out;
-	}
+	if (requeue_pi && match_futex(&key1, &key2))
+		return -EINVAL;
 
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
@@ -2013,7 +2009,7 @@ retry_private:
 
 			ret = get_user(curval, uaddr1);
 			if (ret)
-				goto out;
+				return ret;
 
 			if (!(flags & FLAGS_SHARED))
 				goto retry_private;
@@ -2079,7 +2075,7 @@ retry_private:
 			ret = fault_in_user_writeable(uaddr2);
 			if (!ret)
 				goto retry;
-			goto out;
+			return ret;
 		case -EBUSY:
 		case -EAGAIN:
 			/*
@@ -2198,8 +2194,6 @@ out_unlock:
 	double_unlock_hb(hb1, hb2);
 	wake_up_q(&wake_q);
 	hb_waiters_dec(hb2);
-
-out:
 	return ret ? ret : task_count;
 }
 
@@ -2545,7 +2539,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 		 */
 		if (q->pi_state->owner != current)
 			ret = fixup_pi_state_owner(uaddr, q, current);
-		goto out;
+		return ret ? ret : locked;
 	}
 
 	/*
@@ -2558,7 +2552,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 	 */
 	if (q->pi_state->owner == current) {
 		ret = fixup_pi_state_owner(uaddr, q, NULL);
-		goto out;
+		return ret;
 	}
 
 	/*
@@ -2572,8 +2566,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 				q->pi_state->owner);
 	}
 
-out:
-	return ret ? ret : locked;
+	return ret;
 }
 
 /**
@@ -2670,7 +2663,7 @@ retry_private:
 
 		ret = get_user(uval, uaddr);
 		if (ret)
-			goto out;
+			return ret;
 
 		if (!(flags & FLAGS_SHARED))
 			goto retry_private;
@@ -2683,7 +2676,6 @@ retry_private:
 		ret = -EWOULDBLOCK;
 	}
 
-out:
 	return ret;
 }
 

From 9261308598ad28b9a8a2237d881833e9f217244e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
Date: Thu, 2 Jul 2020 17:28:43 -0300
Subject: [PATCH 355/502] futex: Consistently use fshared as boolean
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since fshared is only conveying true/false values, declare it as bool.

In get_futex_key() the usage of fshared can be restricted to the first part
of the function. If fshared is false the function is terminated early and
the subsequent code can use a constant 'true' instead of the variable.

Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200702202843.520764-5-andrealmeid@collabora.com
---
 kernel/futex.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 362fbca6d614..cda91755b77d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -476,7 +476,7 @@ static u64 get_inode_sequence_number(struct inode *inode)
 /**
  * get_futex_key() - Get parameters which are the keys for a futex
  * @uaddr:	virtual address of the futex
- * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @fshared:	false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
  * @key:	address where result is stored.
  * @rw:		mapping needs to be read/write (values: FUTEX_READ,
  *              FUTEX_WRITE)
@@ -500,8 +500,8 @@ static u64 get_inode_sequence_number(struct inode *inode)
  *
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
-static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_access rw)
+static int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
+			 enum futex_access rw)
 {
 	unsigned long address = (unsigned long)uaddr;
 	struct mm_struct *mm = current->mm;
@@ -538,7 +538,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, enum futex_a
 
 again:
 	/* Ignore any VERIFY_READ mapping (futex common case) */
-	if (unlikely(should_fail_futex(fshared)))
+	if (unlikely(should_fail_futex(true)))
 		return -EFAULT;
 
 	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
@@ -626,7 +626,7 @@ again:
 		 * A RO anonymous page will never change and thus doesn't make
 		 * sense for futex operations.
 		 */
-		if (unlikely(should_fail_futex(fshared)) || ro) {
+		if (unlikely(should_fail_futex(true)) || ro) {
 			err = -EFAULT;
 			goto out;
 		}

From 9a71df495c3d29dab596bb590e73fd8b20106e2d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
Date: Thu, 2 Jul 2020 17:28:42 -0300
Subject: [PATCH 356/502] futex: Remove unused or redundant includes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since 82af7aca ("Removal of FUTEX_FD"), some includes related to file
operations aren't needed anymore. More investigation around the includes
showed that a lot of includes aren't required for compilation, possibly
due to redundant includes. Simplify the code by removing unused
includes.

Signed-off-by: André Almeida <andrealmeid@collabora.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20200702202843.520764-4-andrealmeid@collabora.com
---
 kernel/futex.c | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index cda91755b77d..4616d4ad609d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -32,30 +32,13 @@
  *  "But they come in a choice of three flavours!"
  */
 #include <linux/compat.h>
-#include <linux/slab.h>
-#include <linux/poll.h>
-#include <linux/fs.h>
-#include <linux/file.h>
 #include <linux/jhash.h>
-#include <linux/init.h>
-#include <linux/futex.h>
-#include <linux/mount.h>
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
-#include <linux/signal.h>
-#include <linux/export.h>
-#include <linux/magic.h>
-#include <linux/pid.h>
-#include <linux/nsproxy.h>
-#include <linux/ptrace.h>
-#include <linux/sched/rt.h>
-#include <linux/sched/wake_q.h>
-#include <linux/sched/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/freezer.h>
 #include <linux/memblock.h>
 #include <linux/fault-inject.h>
-#include <linux/refcount.h>
 
 #include <asm/futex.h>
 

From 7904aaa8b22fa07fd5457ee4a885cf9f665cb9c4 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 14 Jul 2020 07:43:26 +0200
Subject: [PATCH 357/502] s390/mm: fix typo in comment

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index d53c2e2ea1fd..598828517d9d 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -376,7 +376,7 @@ static noinline void do_fault_error(struct pt_regs *regs, int access,
  * routines.
  *
  * interruption code (int_code):
- *   04       Protection           ->  Write-Protection  (suprression)
+ *   04       Protection           ->  Write-Protection  (suppression)
  *   10       Segment translation  ->  Not present       (nullification)
  *   11       Page translation     ->  Not present       (nullification)
  *   3b       Region third trans.  ->  Not present       (nullification)

From 529683d4705b6b1fa1c2f902e859ad6a8d17e31e Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Mon, 15 Jun 2020 17:23:11 +0200
Subject: [PATCH 358/502] s390/qdio: fix statistics for 128 SBALs

Old code would only scan up to 127 SBALs at once. So the last statistics
bucket was set aside to count "discovered 127 SBALs with new work"
events.

But nowadays we allow scanning all 128 SBALs for Output Queues, and a
subsequent patch will introduce the same for Input Queues.
So fix up the accounting to use the last bucket only when all 128 SBALs
have been discovered with new work.

Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 drivers/s390/cio/qdio.h       | 6 +-----
 drivers/s390/cio/qdio_debug.c | 2 +-
 drivers/s390/cio/qdio_main.c  | 9 +--------
 3 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h
index bb1c8402c67d..7f0aa95585a4 100644
--- a/drivers/s390/cio/qdio.h
+++ b/drivers/s390/cio/qdio.h
@@ -166,11 +166,7 @@ struct qdio_dev_perf_stat {
 } ____cacheline_aligned;
 
 struct qdio_queue_perf_stat {
-	/*
-	 * Sorted into order-2 buckets: 1, 2-3, 4-7, ... 64-127, 128.
-	 * Since max. 127 SBALs are scanned reuse entry for 128 as queue full
-	 * aka 127 SBALs found.
-	 */
+	/* Sorted into order-2 buckets: 1, 2-3, 4-7, ... 64-127, 128. */
 	unsigned int nr_sbals[8];
 	unsigned int nr_sbal_error;
 	unsigned int nr_sbal_nop;
diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c
index da95c923d81a..863d17c802ca 100644
--- a/drivers/s390/cio/qdio_debug.c
+++ b/drivers/s390/cio/qdio_debug.c
@@ -165,7 +165,7 @@ static int qstat_show(struct seq_file *m, void *v)
 	}
 
 	seq_printf(m, "\n1          2..        4..        8..        "
-		   "16..       32..       64..       127\n");
+		   "16..       32..       64..       128\n");
 	for (i = 0; i < ARRAY_SIZE(q->q_stats.nr_sbals); i++)
 		seq_printf(m, "%-10u ", q->q_stats.nr_sbals[i]);
 	seq_printf(m, "\nError      NOP        Total\n%-10u %-10u %-10u\n\n",
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 0c919a11a46e..d4c699773070 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -413,15 +413,8 @@ static inline void qdio_stop_polling(struct qdio_q *q)
 
 static inline void account_sbals(struct qdio_q *q, unsigned int count)
 {
-	int pos;
-
 	q->q_stats.nr_sbal_total += count;
-	if (count == QDIO_MAX_BUFFERS_MASK) {
-		q->q_stats.nr_sbals[7]++;
-		return;
-	}
-	pos = ilog2(count);
-	q->q_stats.nr_sbals[pos]++;
+	q->q_stats.nr_sbals[ilog2(count)]++;
 }
 
 static void process_buffer_error(struct qdio_q *q, unsigned int start,

From 2bbf282a5e8e7e6b36586718b484a36117b6b8a0 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Wed, 17 Jun 2020 15:30:14 +0200
Subject: [PATCH 359/502] s390/qdio: allow to scan all 128 Input SBALs

The comment is inaccurate, qdio_inbound_q_moved() and/or its callers no
longer get confused by a count of 128 completed SBALs.

Scanning all 128 SBALs at once can improve IRQ reduction (as we now
place the ACK at the right spot), and reduce the amount of processing
needed to handle all completed SBALs.

Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 drivers/s390/cio/qdio_main.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index d4c699773070..0c1f186c6291 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -457,11 +457,7 @@ static int get_inbound_buffer_frontier(struct qdio_q *q, unsigned int start)
 
 	q->timestamp = get_tod_clock_fast();
 
-	/*
-	 * Don't check 128 buffers, as otherwise qdio_inbound_q_moved
-	 * would return 0.
-	 */
-	count = min(atomic_read(&q->nr_buf_used), QDIO_MAX_BUFFERS_MASK);
+	count = atomic_read(&q->nr_buf_used);
 	if (!count)
 		return 0;
 

From a709423f7a3a452e5fa7442425817c1bdccd7926 Mon Sep 17 00:00:00 2001
From: Julian Wiedmann <jwi@linux.ibm.com>
Date: Tue, 16 Jun 2020 14:13:00 +0200
Subject: [PATCH 360/502] s390/qdio: remove internal polling in non-thinint
 path

For non-thinint devices in LPAR, qdio polls an idle Input Queue for a
little while to catch more work. But platform support for thinints has
been around practically _forever_ by now, so this micro-optimization is
seeing 0 actual use. Remove it to reduce the overall complexity of the
hot path.

In the meantime we also grew support for driver-level polling
(eg. NAPI in qeth), so it's quite questionable how useful this would
actually be on current kernels.

Signed-off-by: Julian Wiedmann <jwi@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 drivers/s390/cio/qdio.h      |  3 ---
 drivers/s390/cio/qdio_main.c | 26 ++------------------------
 2 files changed, 2 insertions(+), 27 deletions(-)

diff --git a/drivers/s390/cio/qdio.h b/drivers/s390/cio/qdio.h
index 7f0aa95585a4..cd2df4ff8e0e 100644
--- a/drivers/s390/cio/qdio.h
+++ b/drivers/s390/cio/qdio.h
@@ -15,7 +15,6 @@
 #define QDIO_BUSY_BIT_PATIENCE		(100 << 12)	/* 100 microseconds */
 #define QDIO_BUSY_BIT_RETRY_DELAY	10		/* 10 milliseconds */
 #define QDIO_BUSY_BIT_RETRIES		1000		/* = 10s retry time */
-#define QDIO_INPUT_THRESHOLD		(500 << 12)	/* 500 microseconds */
 
 enum qdio_irq_states {
 	QDIO_IRQ_STATE_INACTIVE,
@@ -181,8 +180,6 @@ struct qdio_input_q {
 	/* Batch of SBALs that we processed while polling the queue: */
 	unsigned int batch_start;
 	unsigned int batch_count;
-	/* last time of noticing incoming data */
-	u64 timestamp;
 };
 
 struct qdio_output_q {
diff --git a/drivers/s390/cio/qdio_main.c b/drivers/s390/cio/qdio_main.c
index 0c1f186c6291..4fab8bba2cdd 100644
--- a/drivers/s390/cio/qdio_main.c
+++ b/drivers/s390/cio/qdio_main.c
@@ -510,14 +510,7 @@ static int get_inbound_buffer_frontier(struct qdio_q *q, unsigned int start)
 
 static int qdio_inbound_q_moved(struct qdio_q *q, unsigned int start)
 {
-	int count;
-
-	count = get_inbound_buffer_frontier(q, start);
-
-	if (count && !is_thinint_irq(q->irq_ptr) && MACHINE_IS_LPAR)
-		q->u.in.timestamp = get_tod_clock();
-
-	return count;
+	return get_inbound_buffer_frontier(q, start);
 }
 
 static inline int qdio_inbound_q_done(struct qdio_q *q, unsigned int start)
@@ -535,22 +528,7 @@ static inline int qdio_inbound_q_done(struct qdio_q *q, unsigned int start)
 		/* more work coming */
 		return 0;
 
-	if (is_thinint_irq(q->irq_ptr))
-		return 1;
-
-	/* don't poll under z/VM */
-	if (MACHINE_IS_VM)
-		return 1;
-
-	/*
-	 * At this point we know, that inbound first_to_check
-	 * has (probably) not moved (see qdio_inbound_processing).
-	 */
-	if (get_tod_clock_fast() > q->u.in.timestamp + QDIO_INPUT_THRESHOLD) {
-		DBF_DEV_EVENT(DBF_INFO, q->irq_ptr, "in done:%02x", start);
-		return 1;
-	} else
-		return 0;
+	return 1;
 }
 
 static inline void qdio_handle_aobs(struct qdio_q *q, int start, int count)

From 3c5f2eb9695cd241c9898a01388b19a149d0b7d2 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 14 Jul 2020 07:46:40 +0200
Subject: [PATCH 361/502] s390/mm: avoid trimming to MAX_ORDER

Trimming to MAX_ORDER was originally done in order to avoid setting
HOLES_IN_ZONE, which in turn would enable a quite expensive
pfn_valid() check. pfn_valid() however only checks if a struct page
exists for a given pfn.

With sparsemem vmemmap there are always struct pages, since memmaps
are allocated for whole sections. Therefore remove the HOLES_IN_ZONE
comment and the trimming.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/kernel/setup.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 5853c9872dfe..295a02bab64d 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -1126,14 +1126,6 @@ void __init setup_arch(char **cmdline_p)
 	free_mem_detect_info();
 	remove_oldmem();
 
-	/*
-	 * Make sure all chunks are MAX_ORDER aligned so we don't need the
-	 * extra checks that HOLES_IN_ZONE would require.
-	 *
-	 * Is this still required?
-	 */
-	memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT));
-
 	if (is_prot_virt_host())
 		setup_uv();
 	setup_memory_end();

From 771cf196cc92a6078656548bbc073aa932c053ab Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Tue, 14 Jul 2020 08:22:21 +0200
Subject: [PATCH 362/502] s390/mm: allow order 10 allocations

Get rid of FORCE_MAX_ZONEORDER which limited allocations to order 8 (= 1MB)
and use the default, which allows for order 10 (= 4MB) allocations.

Given that s390 allows less than the default this caused some memory
allocation problems more or less unique to s390 from time to time.

Note: this was originally introduced with commit 684de39bd795 ("[S390]
Fix IPL from NSS.") in order to support Named Saved Segments, which
could start/end at an arbitrary 1 megabyte boundary and also before
support for sparsemem vmemmap was enabled.

Since NSS support is gone, but sparsemem vmemmap support is available
this limitation can go away.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/Kconfig | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 7697a1f8e819..0df33cffec52 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -625,10 +625,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
 config ARCH_ENABLE_SPLIT_PMD_PTLOCK
 	def_bool y
 
-config FORCE_MAX_ZONEORDER
-	int
-	default "9"
-
 config MAX_PHYSMEM_BITS
 	int "Maximum size of supported physical memory in bits (42-53)"
 	range 42 53

From 88aa8939c96781089e5ace3492d818074c5c6fe9 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Mon, 29 Jun 2020 20:48:09 +0200
Subject: [PATCH 363/502] s390/kernel: unify EX_TABLE* implementations

Replace three implementations with one using the __stringify_in_c
macro conveniently "borrowed" from powerpc and microblaze.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/include/asm/asm-const.h | 12 +++++++++++
 arch/s390/include/asm/linkage.h   | 34 ++++++++++---------------------
 2 files changed, 23 insertions(+), 23 deletions(-)
 create mode 100644 arch/s390/include/asm/asm-const.h

diff --git a/arch/s390/include/asm/asm-const.h b/arch/s390/include/asm/asm-const.h
new file mode 100644
index 000000000000..11f615eb0066
--- /dev/null
+++ b/arch/s390/include/asm/asm-const.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_ASM_CONST_H
+#define _ASM_S390_ASM_CONST_H
+
+#ifdef __ASSEMBLY__
+#  define stringify_in_c(...)	__VA_ARGS__
+#else
+/* This version of stringify will deal with commas... */
+#  define __stringify_in_c(...)	#__VA_ARGS__
+#  define stringify_in_c(...)	__stringify_in_c(__VA_ARGS__) " "
+#endif
+#endif /* _ASM_S390_ASM_CONST_H */
diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h
index 7f22262b0e46..1b52c07b5642 100644
--- a/arch/s390/include/asm/linkage.h
+++ b/arch/s390/include/asm/linkage.h
@@ -2,38 +2,26 @@
 #ifndef __ASM_LINKAGE_H
 #define __ASM_LINKAGE_H
 
+#include <asm/asm-const.h>
 #include <linux/stringify.h>
 
 #define __ALIGN .align 4, 0x07
 #define __ALIGN_STR __stringify(__ALIGN)
 
-#ifndef __ASSEMBLY__
-
 /*
  * Helper macro for exception table entries
  */
-#define EX_TABLE(_fault, _target)	\
-	".section __ex_table,\"a\"\n"	\
-	".align	4\n"			\
-	".long	(" #_fault ") - .\n"	\
-	".long	(" #_target ") - .\n"	\
-	".previous\n"
 
-#else /* __ASSEMBLY__ */
+#define __EX_TABLE(_section, _fault, _target)				\
+	stringify_in_c(.section	_section,"a";)				\
+	stringify_in_c(.align	4;)					\
+	stringify_in_c(.long	(_fault) - .;)				\
+	stringify_in_c(.long	(_target) - .;)				\
+	stringify_in_c(.previous)
 
-#define EX_TABLE(_fault, _target)	\
-	.section __ex_table,"a"	;	\
-	.align	4 ;			\
-	.long	(_fault) - . ;		\
-	.long	(_target) - . ;		\
-	.previous
+#define EX_TABLE(_fault, _target)					\
+	__EX_TABLE(__ex_table, _fault, _target)
+#define EX_TABLE_DMA(_fault, _target)					\
+	__EX_TABLE(.dma.ex_table, _fault, _target)
 
-#define EX_TABLE_DMA(_fault, _target)	\
-	.section .dma.ex_table, "a" ;	\
-	.align	4 ;			\
-	.long	(_fault) - . ;		\
-	.long	(_target) - . ;		\
-	.previous
-
-#endif /* __ASSEMBLY__ */
 #endif

From 05a68e892e89c97df6650cd8cc55058002657cbc Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Tue, 30 Jun 2020 20:52:03 +0200
Subject: [PATCH 364/502] s390/kernel: expand exception table logic to allow
 new handling options

This is a s390 port of commit 548acf19234d ("x86/mm: Expand the
exception table logic to allow new handling options"), which is needed
for implementing BPF_PROBE_MEM on s390.

The new handler field is made 64-bit in order to allow pointing from
dynamically allocated entries to handlers in kernel text. Unlike on x86,
NULL is used instead of ex_handler_default. This is because exception
tables are used by boot/text_dma.S, and it would be a pain to preserve
ex_handler_default.

The new infrastructure is ignored in early_pgm_check_handler, since
there is no pt_regs.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/include/asm/extable.h | 52 +++++++++++++++++++++++++++++----
 arch/s390/include/asm/linkage.h |  3 +-
 arch/s390/kernel/kprobes.c      |  4 +--
 arch/s390/kernel/traps.c        |  7 ++---
 arch/s390/mm/fault.c            |  4 +--
 scripts/sorttable.c             | 41 ++++++++++++++++++++++++++
 6 files changed, 94 insertions(+), 17 deletions(-)

diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h
index ae27f756b409..3beb294fd553 100644
--- a/arch/s390/include/asm/extable.h
+++ b/arch/s390/include/asm/extable.h
@@ -1,12 +1,20 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __S390_EXTABLE_H
 #define __S390_EXTABLE_H
+
+#include <asm/ptrace.h>
+#include <linux/compiler.h>
+
 /*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
+ * The exception table consists of three addresses:
+ *
+ * - Address of an instruction that is allowed to fault.
+ * - Address at which the program should continue.
+ * - Optional address of handler that takes pt_regs * argument and runs in
+ *   interrupt context.
+ *
+ * No registers are modified, so it is entirely up to the continuation code
+ * to figure out what to do.
  *
  * All the routines below use bits of fixup code that are out of line
  * with the main instruction path.  This means when everything is well,
@@ -17,6 +25,7 @@
 struct exception_table_entry
 {
 	int insn, fixup;
+	long handler;
 };
 
 extern struct exception_table_entry *__start_dma_ex_table;
@@ -29,6 +38,39 @@ static inline unsigned long extable_fixup(const struct exception_table_entry *x)
 	return (unsigned long)&x->fixup + x->fixup;
 }
 
+typedef bool (*ex_handler_t)(const struct exception_table_entry *,
+			     struct pt_regs *);
+
+static inline ex_handler_t
+ex_fixup_handler(const struct exception_table_entry *x)
+{
+	if (likely(!x->handler))
+		return NULL;
+	return (ex_handler_t)((unsigned long)&x->handler + x->handler);
+}
+
+static inline bool ex_handle(const struct exception_table_entry *x,
+			     struct pt_regs *regs)
+{
+	ex_handler_t handler = ex_fixup_handler(x);
+
+	if (unlikely(handler))
+		return handler(x, regs);
+	regs->psw.addr = extable_fixup(x);
+	return true;
+}
+
 #define ARCH_HAS_RELATIVE_EXTABLE
 
+static inline void swap_ex_entry_fixup(struct exception_table_entry *a,
+				       struct exception_table_entry *b,
+				       struct exception_table_entry tmp,
+				       int delta)
+{
+	a->fixup = b->fixup + delta;
+	b->fixup = tmp.fixup - delta;
+	a->handler = b->handler + delta;
+	b->handler = tmp.handler - delta;
+}
+
 #endif
diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h
index 1b52c07b5642..a0a7a2c72bd4 100644
--- a/arch/s390/include/asm/linkage.h
+++ b/arch/s390/include/asm/linkage.h
@@ -14,9 +14,10 @@
 
 #define __EX_TABLE(_section, _fault, _target)				\
 	stringify_in_c(.section	_section,"a";)				\
-	stringify_in_c(.align	4;)					\
+	stringify_in_c(.align	8;)					\
 	stringify_in_c(.long	(_fault) - .;)				\
 	stringify_in_c(.long	(_target) - .;)				\
+	stringify_in_c(.quad	0;)					\
 	stringify_in_c(.previous)
 
 #define EX_TABLE(_fault, _target)					\
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 548d0ea9808d..d2a71d872638 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -523,10 +523,8 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr)
 		 * zero, try to fix up.
 		 */
 		entry = s390_search_extables(regs->psw.addr);
-		if (entry) {
-			regs->psw.addr = extable_fixup(entry);
+		if (entry && ex_handle(entry, regs))
 			return 1;
-		}
 
 		/*
 		 * fixup_exception() could not handle it,
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index ff9cc4c3290e..8d1e8a1a97df 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -50,11 +50,8 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str)
         } else {
                 const struct exception_table_entry *fixup;
 		fixup = s390_search_extables(regs->psw.addr);
-                if (fixup)
-			regs->psw.addr = extable_fixup(fixup);
-		else {
+		if (!fixup || !ex_handle(fixup, regs))
 			die(regs, str);
-		}
         }
 }
 
@@ -251,7 +248,7 @@ void monitor_event_exception(struct pt_regs *regs)
 	case BUG_TRAP_TYPE_NONE:
 		fixup = s390_search_extables(regs->psw.addr);
 		if (fixup)
-			regs->psw.addr = extable_fixup(fixup);
+			ex_handle(fixup, regs);
 		break;
 	case BUG_TRAP_TYPE_WARN:
 		break;
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 598828517d9d..aebf9183bedd 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -255,10 +255,8 @@ static noinline void do_no_context(struct pt_regs *regs)
 
 	/* Are we prepared to handle this kernel fault?  */
 	fixup = s390_search_extables(regs->psw.addr);
-	if (fixup) {
-		regs->psw.addr = extable_fixup(fixup);
+	if (fixup && ex_handle(fixup, regs))
 		return;
-	}
 
 	/*
 	 * Oops. The kernel tried to access some bad page. We'll have to
diff --git a/scripts/sorttable.c b/scripts/sorttable.c
index ec6b5e81eba1..0ef3abfc4a51 100644
--- a/scripts/sorttable.c
+++ b/scripts/sorttable.c
@@ -255,6 +255,45 @@ static void x86_sort_relative_table(char *extab_image, int image_size)
 	}
 }
 
+static void s390_sort_relative_table(char *extab_image, int image_size)
+{
+	int i;
+
+	for (i = 0; i < image_size; i += 16) {
+		char *loc = extab_image + i;
+		uint64_t handler;
+
+		w(r((uint32_t *)loc) + i, (uint32_t *)loc);
+		w(r((uint32_t *)(loc + 4)) + (i + 4), (uint32_t *)(loc + 4));
+		/*
+		 * 0 is a special self-relative handler value, which means that
+		 * handler should be ignored. It is safe, because it means that
+		 * handler field points to itself, which should never happen.
+		 * When creating extable-relative values, keep it as 0, since
+		 * this should never occur either: it would mean that handler
+		 * field points to the first extable entry.
+		 */
+		handler = r8((uint64_t *)(loc + 8));
+		if (handler)
+			handler += i + 8;
+		w8(handler, (uint64_t *)(loc + 8));
+	}
+
+	qsort(extab_image, image_size / 16, 16, compare_relative_table);
+
+	for (i = 0; i < image_size; i += 16) {
+		char *loc = extab_image + i;
+		uint64_t handler;
+
+		w(r((uint32_t *)loc) - i, (uint32_t *)loc);
+		w(r((uint32_t *)(loc + 4)) - (i + 4), (uint32_t *)(loc + 4));
+		handler = r8((uint64_t *)(loc + 8));
+		if (handler)
+			handler -= i + 8;
+		w8(handler, (uint64_t *)(loc + 8));
+	}
+}
+
 static int do_file(char const *const fname, void *addr)
 {
 	int rc = -1;
@@ -297,6 +336,8 @@ static int do_file(char const *const fname, void *addr)
 		custom_sort = x86_sort_relative_table;
 		break;
 	case EM_S390:
+		custom_sort = s390_sort_relative_table;
+		break;
 	case EM_AARCH64:
 	case EM_PARISC:
 	case EM_PPC:

From 3f161e0ae863a0456d00e5a6c9c81098c62ab7fe Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Wed, 24 Jun 2020 14:55:22 +0200
Subject: [PATCH 365/502] s390/bpf: implement BPF_PROBE_MEM

This is a s390 port of x86 commit 3dec541b2e63 ("bpf: Add support for BTF
pointers to x86 JIT").

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/net/bpf_jit_comp.c | 139 ++++++++++++++++++++++++++++++++++-
 1 file changed, 138 insertions(+), 1 deletion(-)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index f4242b894cf2..8fe7bdfc8d15 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -49,6 +49,7 @@ struct bpf_jit {
 	int r1_thunk_ip;	/* Address of expoline thunk for 'br %r1' */
 	int r14_thunk_ip;	/* Address of expoline thunk for 'br %r14' */
 	int tail_call_start;	/* Tail call start offset */
+	int excnt;		/* Number of exception table entries */
 	int labels[1];		/* Labels for local jumps */
 };
 
@@ -588,6 +589,84 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth)
 	}
 }
 
+static int get_probe_mem_regno(const u8 *insn)
+{
+	/*
+	 * insn must point to llgc, llgh, llgf or lg, which have destination
+	 * register at the same position.
+	 */
+	if (insn[0] != 0xe3) /* common llgc, llgh, llgf and lg prefix */
+		return -1;
+	if (insn[5] != 0x90 && /* llgc */
+	    insn[5] != 0x91 && /* llgh */
+	    insn[5] != 0x16 && /* llgf */
+	    insn[5] != 0x04) /* lg */
+		return -1;
+	return insn[1] >> 4;
+}
+
+static bool ex_handler_bpf(const struct exception_table_entry *x,
+			   struct pt_regs *regs)
+{
+	int regno;
+	u8 *insn;
+
+	regs->psw.addr = extable_fixup(x);
+	insn = (u8 *)__rewind_psw(regs->psw, regs->int_code >> 16);
+	regno = get_probe_mem_regno(insn);
+	if (WARN_ON_ONCE(regno < 0))
+		/* JIT bug - unexpected instruction. */
+		return false;
+	regs->gprs[regno] = 0;
+	return true;
+}
+
+static int bpf_jit_probe_mem(struct bpf_jit *jit, struct bpf_prog *fp,
+			     int probe_prg, int nop_prg)
+{
+	struct exception_table_entry *ex;
+	s64 delta;
+	u8 *insn;
+	int prg;
+	int i;
+
+	if (!fp->aux->extable)
+		/* Do nothing during early JIT passes. */
+		return 0;
+	insn = jit->prg_buf + probe_prg;
+	if (WARN_ON_ONCE(get_probe_mem_regno(insn) < 0))
+		/* JIT bug - unexpected probe instruction. */
+		return -1;
+	if (WARN_ON_ONCE(probe_prg + insn_length(*insn) != nop_prg))
+		/* JIT bug - gap between probe and nop instructions. */
+		return -1;
+	for (i = 0; i < 2; i++) {
+		if (WARN_ON_ONCE(jit->excnt >= fp->aux->num_exentries))
+			/* Verifier bug - not enough entries. */
+			return -1;
+		ex = &fp->aux->extable[jit->excnt];
+		/* Add extable entries for probe and nop instructions. */
+		prg = i == 0 ? probe_prg : nop_prg;
+		delta = jit->prg_buf + prg - (u8 *)&ex->insn;
+		if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX))
+			/* JIT bug - code and extable must be close. */
+			return -1;
+		ex->insn = delta;
+		/*
+		 * Always land on the nop. Note that extable infrastructure
+		 * ignores fixup field, it is handled by ex_handler_bpf().
+		 */
+		delta = jit->prg_buf + nop_prg - (u8 *)&ex->fixup;
+		if (WARN_ON_ONCE(delta < INT_MIN || delta > INT_MAX))
+			/* JIT bug - landing pad and extable must be close. */
+			return -1;
+		ex->fixup = delta;
+		ex->handler = (u8 *)ex_handler_bpf - (u8 *)&ex->handler;
+		jit->excnt++;
+	}
+	return 0;
+}
+
 /*
  * Compile one eBPF instruction into s390x code
  *
@@ -604,7 +683,14 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 	u32 *addrs = jit->addrs;
 	s32 imm = insn->imm;
 	s16 off = insn->off;
+	int probe_prg = -1;
 	unsigned int mask;
+	int nop_prg;
+	int err;
+
+	if (BPF_CLASS(insn->code) == BPF_LDX &&
+	    BPF_MODE(insn->code) == BPF_PROBE_MEM)
+		probe_prg = jit->prg;
 
 	switch (insn->code) {
 	/*
@@ -1119,6 +1205,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 	 * BPF_LDX
 	 */
 	case BPF_LDX | BPF_MEM | BPF_B: /* dst = *(u8 *)(ul) (src + off) */
+	case BPF_LDX | BPF_PROBE_MEM | BPF_B:
 		/* llgc %dst,0(off,%src) */
 		EMIT6_DISP_LH(0xe3000000, 0x0090, dst_reg, src_reg, REG_0, off);
 		jit->seen |= SEEN_MEM;
@@ -1126,6 +1213,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 			insn_count = 2;
 		break;
 	case BPF_LDX | BPF_MEM | BPF_H: /* dst = *(u16 *)(ul) (src + off) */
+	case BPF_LDX | BPF_PROBE_MEM | BPF_H:
 		/* llgh %dst,0(off,%src) */
 		EMIT6_DISP_LH(0xe3000000, 0x0091, dst_reg, src_reg, REG_0, off);
 		jit->seen |= SEEN_MEM;
@@ -1133,6 +1221,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 			insn_count = 2;
 		break;
 	case BPF_LDX | BPF_MEM | BPF_W: /* dst = *(u32 *)(ul) (src + off) */
+	case BPF_LDX | BPF_PROBE_MEM | BPF_W:
 		/* llgf %dst,off(%src) */
 		jit->seen |= SEEN_MEM;
 		EMIT6_DISP_LH(0xe3000000, 0x0016, dst_reg, src_reg, REG_0, off);
@@ -1140,6 +1229,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp,
 			insn_count = 2;
 		break;
 	case BPF_LDX | BPF_MEM | BPF_DW: /* dst = *(u64 *)(ul) (src + off) */
+	case BPF_LDX | BPF_PROBE_MEM | BPF_DW:
 		/* lg %dst,0(off,%src) */
 		jit->seen |= SEEN_MEM;
 		EMIT6_DISP_LH(0xe3000000, 0x0004, dst_reg, src_reg, REG_0, off);
@@ -1485,6 +1575,23 @@ branch_oc:
 		pr_err("Unknown opcode %02x\n", insn->code);
 		return -1;
 	}
+
+	if (probe_prg != -1) {
+		/*
+		 * Handlers of certain exceptions leave psw.addr pointing to
+		 * the instruction directly after the failing one. Therefore,
+		 * create two exception table entries and also add a nop in
+		 * case two probing instructions come directly after each
+		 * other.
+		 */
+		nop_prg = jit->prg;
+		/* bcr 0,%0 */
+		_EMIT2(0x0700);
+		err = bpf_jit_probe_mem(jit, fp, probe_prg, nop_prg);
+		if (err < 0)
+			return err;
+	}
+
 	return insn_count;
 }
 
@@ -1527,6 +1634,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
 	jit->lit32 = jit->lit32_start;
 	jit->lit64 = jit->lit64_start;
 	jit->prg = 0;
+	jit->excnt = 0;
 
 	bpf_jit_prologue(jit, stack_depth);
 	if (bpf_set_addr(jit, 0) < 0)
@@ -1551,6 +1659,12 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp,
 		jit->lit64_start = ALIGN(jit->lit64_start, 8);
 	jit->size = jit->lit64_start + lit64_size;
 	jit->size_prg = jit->prg;
+
+	if (WARN_ON_ONCE(fp->aux->extable &&
+			 jit->excnt != fp->aux->num_exentries))
+		/* Verifier bug - too many entries. */
+		return -1;
+
 	return 0;
 }
 
@@ -1565,6 +1679,29 @@ struct s390_jit_data {
 	int pass;
 };
 
+static struct bpf_binary_header *bpf_jit_alloc(struct bpf_jit *jit,
+					       struct bpf_prog *fp)
+{
+	struct bpf_binary_header *header;
+	u32 extable_size;
+	u32 code_size;
+
+	/* We need two entries per insn. */
+	fp->aux->num_exentries *= 2;
+
+	code_size = roundup(jit->size,
+			    __alignof__(struct exception_table_entry));
+	extable_size = fp->aux->num_exentries *
+		sizeof(struct exception_table_entry);
+	header = bpf_jit_binary_alloc(code_size + extable_size, &jit->prg_buf,
+				      8, jit_fill_hole);
+	if (!header)
+		return NULL;
+	fp->aux->extable = (struct exception_table_entry *)
+		(jit->prg_buf + code_size);
+	return header;
+}
+
 /*
  * Compile eBPF program "fp"
  */
@@ -1631,7 +1768,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
 	/*
 	 * Final pass: Allocate and generate program
 	 */
-	header = bpf_jit_binary_alloc(jit.size, &jit.prg_buf, 8, jit_fill_hole);
+	header = bpf_jit_alloc(&jit, fp);
 	if (!header) {
 		fp = orig_fp;
 		goto free_addrs;

From 539707caa1a89ee4efc57b4e4231c20c46575ccc Mon Sep 17 00:00:00 2001
From: Shaokun Zhang <zhangshaokun@hisilicon.com>
Date: Thu, 18 Jun 2020 21:35:44 +0800
Subject: [PATCH 366/502] arm64: perf: Correct the event index in sysfs

When a PMU event ID is equal to or greater than 0x4000, the attribute
visibility check reduces it by 0x4000 in place, so sysfs no longer shows
the raw event number. Keep the stored ID intact so that the raw event ID
is reported.

Before this patch:
cat /sys/bus/event_source/devices/armv8_pmuv3_0/events/sample_feed
event=0x001
After this patch:
cat /sys/bus/event_source/devices/armv8_pmuv3_0/events/sample_feed
event=0x4001

Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: <stable@vger.kernel.org>
Link: https://lore.kernel.org/r/1592487344-30555-3-git-send-email-zhangshaokun@hisilicon.com
[will: fixed formatting of 'if' condition]
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/perf_event.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 4d7879484cec..581602413a13 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -155,7 +155,7 @@ armv8pmu_events_sysfs_show(struct device *dev,
 
 	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
 
-	return sprintf(page, "event=0x%03llx\n", pmu_attr->id);
+	return sprintf(page, "event=0x%04llx\n", pmu_attr->id);
 }
 
 #define ARMV8_EVENT_ATTR(name, config)						\
@@ -244,10 +244,13 @@ armv8pmu_event_attr_is_visible(struct kobject *kobj,
 	    test_bit(pmu_attr->id, cpu_pmu->pmceid_bitmap))
 		return attr->mode;
 
-	pmu_attr->id -= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE;
-	if (pmu_attr->id < ARMV8_PMUV3_MAX_COMMON_EVENTS &&
-	    test_bit(pmu_attr->id, cpu_pmu->pmceid_ext_bitmap))
-		return attr->mode;
+	if (pmu_attr->id >= ARMV8_PMUV3_EXT_COMMON_EVENT_BASE) {
+		u64 id = pmu_attr->id - ARMV8_PMUV3_EXT_COMMON_EVENT_BASE;
+
+		if (id < ARMV8_PMUV3_MAX_COMMON_EVENTS &&
+		    test_bit(id, cpu_pmu->pmceid_ext_bitmap))
+			return attr->mode;
+	}
 
 	return 0;
 }

From 1b86abc1c645ad5c9c7bf70910cb3ce73939d2d7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jul 2020 13:11:24 +0800
Subject: [PATCH 367/502] sched_clock: Expose struct clock_read_data

In order to support perf_event_mmap_page::cap_time features, an
architecture needs, aside from a userspace readable counter register,
to expose the exact clock data so that userspace can convert the
counter register into a correct timestamp.

Provide struct clock_read_data and two (seqcount) helpers so that
architectures (arm64 in particular) can expose the numbers to userspace.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Link: https://lore.kernel.org/r/20200716051130.4359-2-leo.yan@linaro.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/sched_clock.h | 28 +++++++++++++++++++++++++
 kernel/time/sched_clock.c   | 41 ++++++++++++-------------------------
 2 files changed, 41 insertions(+), 28 deletions(-)

diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 0bb04a96a6d4..528718e4ed52 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -6,6 +6,34 @@
 #define LINUX_SCHED_CLOCK
 
 #ifdef CONFIG_GENERIC_SCHED_CLOCK
+/**
+ * struct clock_read_data - data required to read from sched_clock()
+ *
+ * @epoch_ns:		sched_clock() value at last update
+ * @epoch_cyc:		Clock cycle value at last update.
+ * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
+ *			clocks.
+ * @read_sched_clock:	Current clock source (or dummy source when suspended).
+ * @mult:		Multipler for scaled math conversion.
+ * @shift:		Shift value for scaled math conversion.
+ *
+ * Care must be taken when updating this structure; it is read by
+ * some very hot code paths. It occupies <=40 bytes and, when combined
+ * with the seqcount used to synchronize access, comfortably fits into
+ * a 64 byte cache line.
+ */
+struct clock_read_data {
+	u64 epoch_ns;
+	u64 epoch_cyc;
+	u64 sched_clock_mask;
+	u64 (*read_sched_clock)(void);
+	u32 mult;
+	u32 shift;
+};
+
+extern struct clock_read_data *sched_clock_read_begin(unsigned int *seq);
+extern int sched_clock_read_retry(unsigned int seq);
+
 extern void generic_sched_clock_init(void);
 
 extern void sched_clock_register(u64 (*read)(void), int bits,
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index fa3f800d7d76..0acaadc3156c 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -19,31 +19,6 @@
 
 #include "timekeeping.h"
 
-/**
- * struct clock_read_data - data required to read from sched_clock()
- *
- * @epoch_ns:		sched_clock() value at last update
- * @epoch_cyc:		Clock cycle value at last update.
- * @sched_clock_mask:   Bitmask for two's complement subtraction of non 64bit
- *			clocks.
- * @read_sched_clock:	Current clock source (or dummy source when suspended).
- * @mult:		Multipler for scaled math conversion.
- * @shift:		Shift value for scaled math conversion.
- *
- * Care must be taken when updating this structure; it is read by
- * some very hot code paths. It occupies <=40 bytes and, when combined
- * with the seqcount used to synchronize access, comfortably fits into
- * a 64 byte cache line.
- */
-struct clock_read_data {
-	u64 epoch_ns;
-	u64 epoch_cyc;
-	u64 sched_clock_mask;
-	u64 (*read_sched_clock)(void);
-	u32 mult;
-	u32 shift;
-};
-
 /**
  * struct clock_data - all data needed for sched_clock() (including
  *                     registration of a new clock source)
@@ -93,6 +68,17 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 	return (cyc * mult) >> shift;
 }
 
+struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
+{
+	*seq = raw_read_seqcount(&cd.seq);
+	return cd.read_data + (*seq & 1);
+}
+
+int sched_clock_read_retry(unsigned int seq)
+{
+	return read_seqcount_retry(&cd.seq, seq);
+}
+
 unsigned long long notrace sched_clock(void)
 {
 	u64 cyc, res;
@@ -100,13 +86,12 @@ unsigned long long notrace sched_clock(void)
 	struct clock_read_data *rd;
 
 	do {
-		seq = raw_read_seqcount(&cd.seq);
-		rd = cd.read_data + (seq & 1);
+		rd = sched_clock_read_begin(&seq);
 
 		cyc = (rd->read_sched_clock() - rd->epoch_cyc) &
 		      rd->sched_clock_mask;
 		res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift);
-	} while (read_seqcount_retry(&cd.seq, seq));
+	} while (sched_clock_read_retry(seq));
 
 	return res;
 }

From aadd6e5caaacd6feca9691ba30536e7de5a7d152 Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Thu, 16 Jul 2020 13:11:25 +0800
Subject: [PATCH 368/502] time/sched_clock: Use raw_read_seqcount_latch()

sched_clock uses seqcount_t latching to switch between two storage
places protected by the sequence counter. This allows it to have
interruptible, NMI-safe, seqcount_t write side critical sections.

Since 7fc26327b756 ("seqlock: Introduce raw_read_seqcount_latch()"),
raw_read_seqcount_latch() became the standardized way for seqcount_t
latch read paths. Due to the dependent load, it also has one read
memory barrier less than the currently used raw_read_seqcount() API.

Use raw_read_seqcount_latch() for the seqcount_t latch read path.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Link: https://lkml.kernel.org/r/20200625085745.GD117543@hirez.programming.kicks-ass.net
Link: https://lkml.kernel.org/r/20200715092345.GA231464@debian-buster-darwi.lab.linutronix.de
Link: https://lore.kernel.org/r/20200716051130.4359-3-leo.yan@linaro.org
References: 1809bfa44e10 ("timers, sched/clock: Avoid deadlock during read from NMI")
Signed-off-by: Will Deacon <will@kernel.org>
---
 kernel/time/sched_clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0acaadc3156c..0deaf4b79fb4 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -70,7 +70,7 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 
 struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
 {
-	*seq = raw_read_seqcount(&cd.seq);
+	*seq = raw_read_seqcount_latch(&cd.seq);
 	return cd.read_data + (*seq & 1);
 }
 

From 950b74ddefc4a42add8b1ae0170aa309338ffe73 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jul 2020 13:11:26 +0800
Subject: [PATCH 369/502] arm64: perf: Implement correct cap_user_time

As reported by Leo, the existing implementation is broken when the
clock and counter don't intersect at 0.

Use the sched_clock's struct clock_read_data information to correctly
implement cap_user_time and cap_user_time_zero.

Note that the ARM64 counter is architecturally only guaranteed to be
56bit wide (implementations are allowed to be wider) and the existing
perf ABI cannot deal with wrap-around.

This implementation should also be faster than the old one, since we
no longer need to recompute mult and shift on every update.

[leoyan: Use mul_u64_u32_shr() to convert cyc to ns to avoid overflow]

Reported-by: Leo Yan <leo.yan@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Link: https://lore.kernel.org/r/20200716051130.4359-4-leo.yan@linaro.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/perf_event.c | 38 ++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 581602413a13..3bbbc22a5148 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -19,6 +19,7 @@
 #include <linux/of.h>
 #include <linux/perf/arm_pmu.h>
 #include <linux/platform_device.h>
+#include <linux/sched_clock.h>
 #include <linux/smp.h>
 
 /* ARMv8 Cortex-A53 specific event types. */
@@ -1168,28 +1169,47 @@ device_initcall(armv8_pmu_driver_init)
 void arch_perf_update_userpage(struct perf_event *event,
 			       struct perf_event_mmap_page *userpg, u64 now)
 {
-	u32 freq;
-	u32 shift;
+	struct clock_read_data *rd;
+	unsigned int seq;
+	u64 ns;
 
 	/*
 	 * Internal timekeeping for enabled/running/stopped times
 	 * is always computed with the sched_clock.
 	 */
-	freq = arch_timer_get_rate();
 	userpg->cap_user_time = 1;
+	userpg->cap_user_time_zero = 1;
+
+	do {
+		rd = sched_clock_read_begin(&seq);
+
+		userpg->time_mult = rd->mult;
+		userpg->time_shift = rd->shift;
+		userpg->time_zero = rd->epoch_ns;
+
+		/*
+		 * This isn't strictly correct, the ARM64 counter can be
+		 * 'short' and then we get funnies when it wraps. The correct
+		 * thing would be to extend the perf ABI with a cycle and mask
+		 * value, but because wrapping on ARM64 is very rare in
+		 * practise this 'works'.
+		 */
+		ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift);
+		userpg->time_zero -= ns;
+
+	} while (sched_clock_read_retry(seq));
+
+	userpg->time_offset = userpg->time_zero - now;
 
-	clocks_calc_mult_shift(&userpg->time_mult, &shift, freq,
-			NSEC_PER_SEC, 0);
 	/*
 	 * time_shift is not expected to be greater than 31 due to
 	 * the original published conversion algorithm shifting a
 	 * 32-bit value (now specifies a 64-bit value) - refer
 	 * perf_event_mmap_page documentation in perf_event.h.
 	 */
-	if (shift == 32) {
-		shift = 31;
+	if (userpg->time_shift == 32) {
+		userpg->time_shift = 31;
 		userpg->time_mult >>= 1;
 	}
-	userpg->time_shift = (u16)shift;
-	userpg->time_offset = -now;
+
 }

From 279a811eb520594fac3cd3a541e6c7ea50072ac9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jul 2020 13:11:27 +0800
Subject: [PATCH 370/502] arm64: perf: Only advertise cap_user_time for
 arch_timer

When sched_clock is running on anything other than arch_timer, don't
advertise cap_user_time*.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Link: https://lore.kernel.org/r/20200716051130.4359-5-leo.yan@linaro.org
Requested-by: Will Deacon <will@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/perf_event.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 3bbbc22a5148..674edc7ba8ca 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -13,6 +13,8 @@
 #include <asm/sysreg.h>
 #include <asm/virt.h>
 
+#include <clocksource/arm_arch_timer.h>
+
 #include <linux/acpi.h>
 #include <linux/clocksource.h>
 #include <linux/kvm_host.h>
@@ -1173,16 +1175,15 @@ void arch_perf_update_userpage(struct perf_event *event,
 	unsigned int seq;
 	u64 ns;
 
-	/*
-	 * Internal timekeeping for enabled/running/stopped times
-	 * is always computed with the sched_clock.
-	 */
-	userpg->cap_user_time = 1;
-	userpg->cap_user_time_zero = 1;
+	userpg->cap_user_time = 0;
+	userpg->cap_user_time_zero = 0;
 
 	do {
 		rd = sched_clock_read_begin(&seq);
 
+		if (rd->read_sched_clock != arch_timer_read_counter)
+			return;
+
 		userpg->time_mult = rd->mult;
 		userpg->time_shift = rd->shift;
 		userpg->time_zero = rd->epoch_ns;
@@ -1212,4 +1213,10 @@ void arch_perf_update_userpage(struct perf_event *event,
 		userpg->time_mult >>= 1;
 	}
 
+	/*
+	 * Internal timekeeping for enabled/running/stopped times
+	 * is always computed with the sched_clock.
+	 */
+	userpg->cap_user_time = 1;
+	userpg->cap_user_time_zero = 1;
 }

From 6c0246a4588d418f72acd40a7b7601be403d80a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jul 2020 13:11:28 +0800
Subject: [PATCH 371/502] perf: Add perf_event_mmap_page::cap_user_time_short
 ABI

In order to support short clock counters, provide an ABI extension.

As a whole:

    u64 time, delta, cyc = read_cycle_counter();

+   if (cap_user_time_short)
+	cyc = time_cycles + ((cyc - time_cycles) & time_mask);

    delta = mul_u64_u32_shr(cyc, time_mult, time_shift);

    if (cap_user_time_zero)
	time = time_zero + delta;

    delta += time_offset;

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Link: https://lore.kernel.org/r/20200716051130.4359-6-leo.yan@linaro.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/uapi/linux/perf_event.h | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 7b2d6fc9e6ed..21a1edd08cbe 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -532,9 +532,10 @@ struct perf_event_mmap_page {
 				cap_bit0_is_deprecated	: 1, /* Always 1, signals that bit 0 is zero */
 
 				cap_user_rdpmc		: 1, /* The RDPMC instruction can be used to read counts */
-				cap_user_time		: 1, /* The time_* fields are used */
+				cap_user_time		: 1, /* The time_{shift,mult,offset} fields are used */
 				cap_user_time_zero	: 1, /* The time_zero field is used */
-				cap_____res		: 59;
+				cap_user_time_short	: 1, /* the time_{cycle,mask} fields are used */
+				cap_____res		: 58;
 		};
 	};
 
@@ -593,13 +594,29 @@ struct perf_event_mmap_page {
 	 *               ((rem * time_mult) >> time_shift);
 	 */
 	__u64	time_zero;
+
 	__u32	size;			/* Header size up to __reserved[] fields. */
+	__u32	__reserved_1;
+
+	/*
+	 * If cap_usr_time_short, the hardware clock is less than 64bit wide
+	 * and we must compute the 'cyc' value, as used by cap_usr_time, as:
+	 *
+	 *   cyc = time_cycles + ((cyc - time_cycles) & time_mask)
+	 *
+	 * NOTE: this form is explicitly chosen such that cap_usr_time_short
+	 *       is a correction on top of cap_usr_time, and code that doesn't
+	 *       know about cap_usr_time_short still works under the assumption
+	 *       the counter doesn't wrap.
+	 */
+	__u64	time_cycles;
+	__u64	time_mask;
 
 		/*
 		 * Hole for extension of the self monitor capabilities
 		 */
 
-	__u8	__reserved[118*8+4];	/* align to 1k. */
+	__u8	__reserved[116*8];	/* align to 1k. */
 
 	/*
 	 * Control data for the mmap() data buffer.

From c8f9eb0d6ebaa768c9f6eb2ee21b01d74230934d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 16 Jul 2020 13:11:29 +0800
Subject: [PATCH 372/502] arm64: perf: Add cap_user_time_short

This completes the ARM64 cap_user_time support.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Leo Yan <leo.yan@linaro.org>
Link: https://lore.kernel.org/r/20200716051130.4359-7-leo.yan@linaro.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/perf_event.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 674edc7ba8ca..fdb6029c9021 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1177,6 +1177,7 @@ void arch_perf_update_userpage(struct perf_event *event,
 
 	userpg->cap_user_time = 0;
 	userpg->cap_user_time_zero = 0;
+	userpg->cap_user_time_short = 0;
 
 	do {
 		rd = sched_clock_read_begin(&seq);
@@ -1187,13 +1188,13 @@ void arch_perf_update_userpage(struct perf_event *event,
 		userpg->time_mult = rd->mult;
 		userpg->time_shift = rd->shift;
 		userpg->time_zero = rd->epoch_ns;
+		userpg->time_cycles = rd->epoch_cyc;
+		userpg->time_mask = rd->sched_clock_mask;
 
 		/*
-		 * This isn't strictly correct, the ARM64 counter can be
-		 * 'short' and then we get funnies when it wraps. The correct
-		 * thing would be to extend the perf ABI with a cycle and mask
-		 * value, but because wrapping on ARM64 is very rare in
-		 * practise this 'works'.
+		 * Subtract the cycle base, such that software that
+		 * doesn't know about cap_user_time_short still 'works'
+		 * assuming no wraps.
 		 */
 		ns = mul_u64_u32_shr(rd->epoch_cyc, rd->mult, rd->shift);
 		userpg->time_zero -= ns;
@@ -1219,4 +1220,5 @@ void arch_perf_update_userpage(struct perf_event *event,
 	 */
 	userpg->cap_user_time = 1;
 	userpg->cap_user_time_zero = 1;
+	userpg->cap_user_time_short = 1;
 }

From 5271d915a99c696a2f16ae59cf6a037be35afa22 Mon Sep 17 00:00:00 2001
From: Leo Yan <leo.yan@linaro.org>
Date: Thu, 16 Jul 2020 13:11:30 +0800
Subject: [PATCH 373/502] tools headers UAPI: Update tools's copy of
 linux/perf_event.h

To get the changes in the commit:

  "perf: Add perf_event_mmap_page::cap_user_time_short ABI"

This update is a prerequisite for adding support for the ABI extension
for short clock counters.

Signed-off-by: Leo Yan <leo.yan@linaro.org>
Link: https://lore.kernel.org/r/20200716051130.4359-8-leo.yan@linaro.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 tools/include/uapi/linux/perf_event.h | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index 7b2d6fc9e6ed..21a1edd08cbe 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -532,9 +532,10 @@ struct perf_event_mmap_page {
 				cap_bit0_is_deprecated	: 1, /* Always 1, signals that bit 0 is zero */
 
 				cap_user_rdpmc		: 1, /* The RDPMC instruction can be used to read counts */
-				cap_user_time		: 1, /* The time_* fields are used */
+				cap_user_time		: 1, /* The time_{shift,mult,offset} fields are used */
 				cap_user_time_zero	: 1, /* The time_zero field is used */
-				cap_____res		: 59;
+				cap_user_time_short	: 1, /* the time_{cycle,mask} fields are used */
+				cap_____res		: 58;
 		};
 	};
 
@@ -593,13 +594,29 @@ struct perf_event_mmap_page {
 	 *               ((rem * time_mult) >> time_shift);
 	 */
 	__u64	time_zero;
+
 	__u32	size;			/* Header size up to __reserved[] fields. */
+	__u32	__reserved_1;
+
+	/*
+	 * If cap_usr_time_short, the hardware clock is less than 64bit wide
+	 * and we must compute the 'cyc' value, as used by cap_usr_time, as:
+	 *
+	 *   cyc = time_cycles + ((cyc - time_cycles) & time_mask)
+	 *
+	 * NOTE: this form is explicitly chosen such that cap_usr_time_short
+	 *       is a correction on top of cap_usr_time, and code that doesn't
+	 *       know about cap_usr_time_short still works under the assumption
+	 *       the counter doesn't wrap.
+	 */
+	__u64	time_cycles;
+	__u64	time_mask;
 
 		/*
 		 * Hole for extension of the self monitor capabilities
 		 */
 
-	__u8	__reserved[118*8+4];	/* align to 1k. */
+	__u8	__reserved[116*8];	/* align to 1k. */
 
 	/*
 	 * Control data for the mmap() data buffer.

From f143c11bb7b924403ea2d5b5c990717772293620 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 21 Nov 2019 12:41:40 +0000
Subject: [PATCH 374/502] tools: bpf: Use local copy of headers including
 uapi/linux/filter.h

Pulling header files directly out of the kernel sources for inclusion in
userspace programs is highly error prone, not least because it bypasses
the kbuild infrastructure entirely and so may end up referencing other
header files that have not been generated.

Subsequent patches will cause compiler.h to pull in the ungenerated
asm/rwonce.h file via filter.h, breaking the build for tools/bpf:

  | $ make -C tools/bpf
  | make: Entering directory '/linux/tools/bpf'
  |   CC       bpf_jit_disasm.o
  |   LINK     bpf_jit_disasm
  |   CC       bpf_dbg.o
  | In file included from /linux/include/uapi/linux/filter.h:9,
  |                  from /linux/tools/bpf/bpf_dbg.c:41:
  | /linux/include/linux/compiler.h:247:10: fatal error: asm/rwonce.h: No such file or directory
  |  #include <asm/rwonce.h>
  |           ^~~~~~~~~~~~~~
  | compilation terminated.
  | make: *** [Makefile:61: bpf_dbg.o] Error 1
  | make: Leaving directory '/linux/tools/bpf'

Take a copy of the installed version of linux/filter.h  (i.e. the one
created by the 'headers_install' target) into tools/include/uapi/linux/
and adjust the BPF tool Makefile to reference the local include
directories instead of those in the main source tree.

Cc: Masahiro Yamada <yamada.masahiro@socionext.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Reported-by: Xiao Yang <ice_yangxiao@163.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 tools/bpf/Makefile                |  3 +-
 tools/include/uapi/linux/filter.h | 90 +++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+), 1 deletion(-)
 create mode 100644 tools/include/uapi/linux/filter.h

diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile
index 6df1850f8353..8a69258fd8aa 100644
--- a/tools/bpf/Makefile
+++ b/tools/bpf/Makefile
@@ -9,7 +9,8 @@ MAKE = make
 INSTALL ?= install
 
 CFLAGS += -Wall -O2
-CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/include/uapi -I$(srctree)/include
+CFLAGS += -D__EXPORTED_HEADERS__ -I$(srctree)/tools/include/uapi \
+	  -I$(srctree)/tools/include
 
 # This will work when bpf is built in tools env. where srctree
 # isn't set and when invoked from selftests build, where srctree
diff --git a/tools/include/uapi/linux/filter.h b/tools/include/uapi/linux/filter.h
new file mode 100644
index 000000000000..eaef459e7bd4
--- /dev/null
+++ b/tools/include/uapi/linux/filter.h
@@ -0,0 +1,90 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Linux Socket Filter Data Structures
+ */
+
+#ifndef __LINUX_FILTER_H__
+#define __LINUX_FILTER_H__
+
+
+#include <linux/types.h>
+#include <linux/bpf_common.h>
+
+/*
+ * Current version of the filter code architecture.
+ */
+#define BPF_MAJOR_VERSION 1
+#define BPF_MINOR_VERSION 1
+
+/*
+ *	Try and keep these values and structures similar to BSD, especially
+ *	the BPF code definitions which need to match so you can share filters
+ */
+ 
+struct sock_filter {	/* Filter block */
+	__u16	code;   /* Actual filter code */
+	__u8	jt;	/* Jump true */
+	__u8	jf;	/* Jump false */
+	__u32	k;      /* Generic multiuse field */
+};
+
+struct sock_fprog {	/* Required for SO_ATTACH_FILTER. */
+	unsigned short		len;	/* Number of filter blocks */
+	struct sock_filter *filter;
+};
+
+/* ret - BPF_K and BPF_X also apply */
+#define BPF_RVAL(code)  ((code) & 0x18)
+#define         BPF_A           0x10
+
+/* misc */
+#define BPF_MISCOP(code) ((code) & 0xf8)
+#define         BPF_TAX         0x00
+#define         BPF_TXA         0x80
+
+/*
+ * Macros for filter block array initializers.
+ */
+#ifndef BPF_STMT
+#define BPF_STMT(code, k) { (unsigned short)(code), 0, 0, k }
+#endif
+#ifndef BPF_JUMP
+#define BPF_JUMP(code, k, jt, jf) { (unsigned short)(code), jt, jf, k }
+#endif
+
+/*
+ * Number of scratch memory words for: BPF_ST and BPF_STX
+ */
+#define BPF_MEMWORDS 16
+
+/* RATIONALE. Negative offsets are invalid in BPF.
+   We use them to reference ancillary data.
+   Unlike introduction new instructions, it does not break
+   existing compilers/optimizers.
+ */
+#define SKF_AD_OFF    (-0x1000)
+#define SKF_AD_PROTOCOL 0
+#define SKF_AD_PKTTYPE 	4
+#define SKF_AD_IFINDEX 	8
+#define SKF_AD_NLATTR	12
+#define SKF_AD_NLATTR_NEST	16
+#define SKF_AD_MARK 	20
+#define SKF_AD_QUEUE	24
+#define SKF_AD_HATYPE	28
+#define SKF_AD_RXHASH	32
+#define SKF_AD_CPU	36
+#define SKF_AD_ALU_XOR_X	40
+#define SKF_AD_VLAN_TAG	44
+#define SKF_AD_VLAN_TAG_PRESENT 48
+#define SKF_AD_PAY_OFFSET	52
+#define SKF_AD_RANDOM	56
+#define SKF_AD_VLAN_TPID	60
+#define SKF_AD_MAX	64
+
+#define SKF_NET_OFF	(-0x100000)
+#define SKF_LL_OFF	(-0x200000)
+
+#define BPF_NET_OFF	SKF_NET_OFF
+#define BPF_LL_OFF	SKF_LL_OFF
+
+#endif /* __LINUX_FILTER_H__ */

From e506ea451254ab17e0bf918ca36232fec2a9b10c Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Oct 2019 16:29:32 -0700
Subject: [PATCH 375/502] compiler.h: Split {READ,WRITE}_ONCE definitions out
 into rwonce.h

In preparation for allowing architectures to define their own
implementation of the READ_ONCE() macro, move the generic
{READ,WRITE}_ONCE() definitions out of the unwieldy 'linux/compiler.h'
file and into a new 'rwonce.h' header under 'asm-generic'.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/asm-generic/Kbuild    |   1 +
 include/asm-generic/barrier.h |   2 +-
 include/asm-generic/rwonce.h  | 101 ++++++++++++++++++++++++++++++++++
 include/linux/compiler.h      |  93 +------------------------------
 4 files changed, 105 insertions(+), 92 deletions(-)
 create mode 100644 include/asm-generic/rwonce.h

diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild
index 44ec80e70518..74b0612601dd 100644
--- a/include/asm-generic/Kbuild
+++ b/include/asm-generic/Kbuild
@@ -45,6 +45,7 @@ mandatory-y += pci.h
 mandatory-y += percpu.h
 mandatory-y += pgalloc.h
 mandatory-y += preempt.h
+mandatory-y += rwonce.h
 mandatory-y += sections.h
 mandatory-y += serial.h
 mandatory-y += shmparam.h
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 2eacaf7d62f6..8116744bb82c 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -13,7 +13,7 @@
 
 #ifndef __ASSEMBLY__
 
-#include <linux/compiler.h>
+#include <asm/rwonce.h>
 
 #ifndef nop
 #define nop()	asm volatile ("nop")
diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h
new file mode 100644
index 000000000000..87584379da43
--- /dev/null
+++ b/include/asm-generic/rwonce.h
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Prevent the compiler from merging or refetching reads or writes. The
+ * compiler is also forbidden from reordering successive instances of
+ * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
+ * particular ordering. One way to make the compiler aware of ordering is to
+ * put the two invocations of READ_ONCE or WRITE_ONCE in different C
+ * statements.
+ *
+ * These two macros will also work on aggregate data types like structs or
+ * unions.
+ *
+ * Their two major use cases are: (1) Mediating communication between
+ * process-level code and irq/NMI handlers, all running on the same CPU,
+ * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
+ * mutilate accesses that either do not require ordering or that interact
+ * with an explicit memory barrier or atomic instruction that provides the
+ * required ordering.
+ */
+#ifndef __ASM_GENERIC_RWONCE_H
+#define __ASM_GENERIC_RWONCE_H
+
+#ifndef __ASSEMBLY__
+
+#include <linux/compiler_types.h>
+#include <linux/kasan-checks.h>
+#include <linux/kcsan-checks.h>
+
+#include <asm/barrier.h>
+
+/*
+ * Yes, this permits 64-bit accesses on 32-bit architectures. These will
+ * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
+ * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
+ * (e.g. a virtual address) and a strong prevailing wind.
+ */
+#define compiletime_assert_rwonce_type(t)					\
+	compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),	\
+		"Unsupported access size for {READ,WRITE}_ONCE().")
+
+/*
+ * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
+ * atomicity or dependency ordering guarantees. Note that this may result
+ * in tears!
+ */
+#define __READ_ONCE(x)	(*(const volatile __unqual_scalar_typeof(x) *)&(x))
+
+#define __READ_ONCE_SCALAR(x)						\
+({									\
+	__unqual_scalar_typeof(x) __x = __READ_ONCE(x);			\
+	smp_read_barrier_depends();					\
+	(typeof(x))__x;							\
+})
+
+#define READ_ONCE(x)							\
+({									\
+	compiletime_assert_rwonce_type(x);				\
+	__READ_ONCE_SCALAR(x);						\
+})
+
+#define __WRITE_ONCE(x, val)						\
+do {									\
+	*(volatile typeof(x) *)&(x) = (val);				\
+} while (0)
+
+#define WRITE_ONCE(x, val)						\
+do {									\
+	compiletime_assert_rwonce_type(x);				\
+	__WRITE_ONCE(x, val);						\
+} while (0)
+
+static __no_sanitize_or_inline
+unsigned long __read_once_word_nocheck(const void *addr)
+{
+	return __READ_ONCE(*(unsigned long *)addr);
+}
+
+/*
+ * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
+ * word from memory atomically but without telling KASAN/KCSAN. This is
+ * usually used by unwinding code when walking the stack of a running process.
+ */
+#define READ_ONCE_NOCHECK(x)						\
+({									\
+	unsigned long __x;						\
+	compiletime_assert(sizeof(x) == sizeof(__x),			\
+		"Unsupported access size for READ_ONCE_NOCHECK().");	\
+	__x = __read_once_word_nocheck(&(x));				\
+	smp_read_barrier_depends();					\
+	(typeof(x))__x;							\
+})
+
+static __no_kasan_or_inline
+unsigned long read_word_at_a_time(const void *addr)
+{
+	kasan_check_read(addr, 1);
+	return *(unsigned long *)addr;
+}
+
+#endif /* __ASSEMBLY__ */
+#endif	/* __ASM_GENERIC_RWONCE_H */
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 204e76856435..f075a3df4fe2 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -230,28 +230,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 # define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __LINE__)
 #endif
 
-/*
- * Prevent the compiler from merging or refetching reads or writes. The
- * compiler is also forbidden from reordering successive instances of
- * READ_ONCE and WRITE_ONCE, but only when the compiler is aware of some
- * particular ordering. One way to make the compiler aware of ordering is to
- * put the two invocations of READ_ONCE or WRITE_ONCE in different C
- * statements.
- *
- * These two macros will also work on aggregate data types like structs or
- * unions.
- *
- * Their two major use cases are: (1) Mediating communication between
- * process-level code and irq/NMI handlers, all running on the same CPU,
- * and (2) Ensuring that the compiler does not fold, spindle, or otherwise
- * mutilate accesses that either do not require ordering or that interact
- * with an explicit memory barrier or atomic instruction that provides the
- * required ordering.
- */
-#include <asm/barrier.h>
-#include <linux/kasan-checks.h>
-#include <linux/kcsan-checks.h>
-
 /**
  * data_race - mark an expression as containing intentional data races
  *
@@ -272,65 +250,6 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 	__v;								\
 })
 
-/*
- * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
- * atomicity or dependency ordering guarantees. Note that this may result
- * in tears!
- */
-#define __READ_ONCE(x)	(*(const volatile __unqual_scalar_typeof(x) *)&(x))
-
-#define __READ_ONCE_SCALAR(x)						\
-({									\
-	__unqual_scalar_typeof(x) __x = __READ_ONCE(x);			\
-	smp_read_barrier_depends();					\
-	(typeof(x))__x;							\
-})
-
-#define READ_ONCE(x)							\
-({									\
-	compiletime_assert_rwonce_type(x);				\
-	__READ_ONCE_SCALAR(x);						\
-})
-
-#define __WRITE_ONCE(x, val)						\
-do {									\
-	*(volatile typeof(x) *)&(x) = (val);				\
-} while (0)
-
-#define WRITE_ONCE(x, val)						\
-do {									\
-	compiletime_assert_rwonce_type(x);				\
-	__WRITE_ONCE(x, val);						\
-} while (0)
-
-static __no_sanitize_or_inline
-unsigned long __read_once_word_nocheck(const void *addr)
-{
-	return __READ_ONCE(*(unsigned long *)addr);
-}
-
-/*
- * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need to load a
- * word from memory atomically but without telling KASAN/KCSAN. This is
- * usually used by unwinding code when walking the stack of a running process.
- */
-#define READ_ONCE_NOCHECK(x)						\
-({									\
-	unsigned long __x;						\
-	compiletime_assert(sizeof(x) == sizeof(__x),			\
-		"Unsupported access size for READ_ONCE_NOCHECK().");	\
-	__x = __read_once_word_nocheck(&(x));				\
-	smp_read_barrier_depends();					\
-	(typeof(x))__x;							\
-})
-
-static __no_kasan_or_inline
-unsigned long read_word_at_a_time(const void *addr)
-{
-	kasan_check_read(addr, 1);
-	return *(unsigned long *)addr;
-}
-
 #endif /* __KERNEL__ */
 
 /*
@@ -395,16 +314,6 @@ static inline void *offset_to_ptr(const int *off)
 	compiletime_assert(__native_word(t),				\
 		"Need native word sized stores/loads for atomicity.")
 
-/*
- * Yes, this permits 64-bit accesses on 32-bit architectures. These will
- * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
- * rely on the access being split into 2x32-bit accesses for a 32-bit quantity
- * (e.g. a virtual address) and a strong prevailing wind.
- */
-#define compiletime_assert_rwonce_type(t)					\
-	compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),	\
-		"Unsupported access size for {READ,WRITE}_ONCE().")
-
 /* &a[0] degrades to a pointer: a different type from an array */
 #define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 
@@ -414,4 +323,6 @@ static inline void *offset_to_ptr(const int *off)
  */
 #define prevent_tail_call_optimization()	mb()
 
+#include <asm/rwonce.h>
+
 #endif /* __LINUX_COMPILER_H */

From b78b331a3f5c0773171dadd6bbfa2a2242b45604 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 15 Oct 2019 17:30:47 -0700
Subject: [PATCH 376/502] asm/rwonce: Allow __READ_ONCE to be overridden by the
 architecture

The meat and potatoes of READ_ONCE() is defined by the __READ_ONCE()
macro, which uses volatile casts in an attempt to avoid tearing of
byte, halfword, word and double-word accesses. Allow this to be
overridden by the architecture code in the case that things like memory
barriers are also required.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/asm-generic/rwonce.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h
index 87584379da43..04586b55a7c2 100644
--- a/include/asm-generic/rwonce.h
+++ b/include/asm-generic/rwonce.h
@@ -43,7 +43,9 @@
  * atomicity or dependency ordering guarantees. Note that this may result
  * in tears!
  */
+#ifndef __READ_ONCE
 #define __READ_ONCE(x)	(*(const volatile __unqual_scalar_typeof(x) *)&(x))
+#endif
 
 #define __READ_ONCE_SCALAR(x)						\
 ({									\

From d6462858851549c62d73eaa14b31132b0f32d6b6 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 30 Oct 2019 16:50:10 +0000
Subject: [PATCH 377/502] alpha: Override READ_ONCE() with barriered
 implementation

Rather than relying on the core code to use smp_read_barrier_depends()
as part of the READ_ONCE() definition, instead override __READ_ONCE()
in the Alpha code so that it generates the required mb() and then
implement smp_load_acquire() using the new macro to avoid redundant
back-to-back barriers from the generic implementation.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/alpha/include/asm/barrier.h | 59 +++-----------------------------
 arch/alpha/include/asm/rwonce.h  | 35 +++++++++++++++++++
 2 files changed, 40 insertions(+), 54 deletions(-)
 create mode 100644 arch/alpha/include/asm/rwonce.h

diff --git a/arch/alpha/include/asm/barrier.h b/arch/alpha/include/asm/barrier.h
index 92ec486a4f9e..c56bfffc9918 100644
--- a/arch/alpha/include/asm/barrier.h
+++ b/arch/alpha/include/asm/barrier.h
@@ -2,64 +2,15 @@
 #ifndef __BARRIER_H
 #define __BARRIER_H
 
-#include <asm/compiler.h>
-
 #define mb()	__asm__ __volatile__("mb": : :"memory")
 #define rmb()	__asm__ __volatile__("mb": : :"memory")
 #define wmb()	__asm__ __volatile__("wmb": : :"memory")
 
-/**
- * read_barrier_depends - Flush all pending reads that subsequents reads
- * depend on.
- *
- * No data-dependent reads from memory-like regions are ever reordered
- * over this barrier.  All reads preceding this primitive are guaranteed
- * to access memory (but not necessarily other CPUs' caches) before any
- * reads following this primitive that depend on the data return by
- * any of the preceding reads.  This primitive is much lighter weight than
- * rmb() on most CPUs, and is never heavier weight than is
- * rmb().
- *
- * These ordering constraints are respected by both the local CPU
- * and the compiler.
- *
- * Ordering is not guaranteed by anything other than these primitives,
- * not even by data dependencies.  See the documentation for
- * memory_barrier() for examples and URLs to more information.
- *
- * For example, the following code would force ordering (the initial
- * value of "a" is zero, "b" is one, and "p" is "&a"):
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	b = 2;
- *	memory_barrier();
- *	p = &b;				q = p;
- *					read_barrier_depends();
- *					d = *q;
- * </programlisting>
- *
- * because the read of "*q" depends on the read of "p" and these
- * two reads are separated by a read_barrier_depends().  However,
- * the following code, with the same initial values for "a" and "b":
- *
- * <programlisting>
- *	CPU 0				CPU 1
- *
- *	a = 2;
- *	memory_barrier();
- *	b = 3;				y = b;
- *					read_barrier_depends();
- *					x = a;
- * </programlisting>
- *
- * does not enforce ordering, since there is no data dependency between
- * the read of "a" and the read of "b".  Therefore, on some CPUs, such
- * as Alpha, "y" could be set to 3 and "x" to 0.  Use rmb()
- * in cases like this where there are no data dependencies.
- */
-#define read_barrier_depends() __asm__ __volatile__("mb": : :"memory")
+#define __smp_load_acquire(p)						\
+({									\
+	compiletime_assert_atomic_type(*p);				\
+	__READ_ONCE(*p);						\
+})
 
 #ifdef CONFIG_SMP
 #define __ASM_SMP_MB	"\tmb\n"
diff --git a/arch/alpha/include/asm/rwonce.h b/arch/alpha/include/asm/rwonce.h
new file mode 100644
index 000000000000..35542bcf92b3
--- /dev/null
+++ b/arch/alpha/include/asm/rwonce.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2019 Google LLC.
+ */
+#ifndef __ASM_RWONCE_H
+#define __ASM_RWONCE_H
+
+#ifdef CONFIG_SMP
+
+#include <asm/barrier.h>
+
+/*
+ * Alpha is apparently daft enough to reorder address-dependent loads
+ * on some CPU implementations. Knock some common sense into it with
+ * a memory barrier in READ_ONCE().
+ *
+ * For the curious, more information about this unusual reordering is
+ * available in chapter 15 of the "perfbook":
+ *
+ *  https://kernel.org/pub/linux/kernel/people/paulmck/perfbook/perfbook.html
+ *
+ */
+#define __READ_ONCE(x)							\
+({									\
+	__unqual_scalar_typeof(x) __x =					\
+		(*(volatile typeof(__x) *)(&(x)));			\
+	mb();								\
+	(typeof(x))__x;							\
+})
+
+#endif /* CONFIG_SMP */
+
+#include <asm-generic/rwonce.h>
+
+#endif /* __ASM_RWONCE_H */

From 3c9184109e78ea2371ca8fa66d7f36986a53af98 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 30 Oct 2019 16:51:07 +0000
Subject: [PATCH 378/502] asm/rwonce: Remove smp_read_barrier_depends()
 invocation

Alpha overrides __READ_ONCE() directly, so there's no need to use
smp_read_barrier_depends() in the core code. This also means that
__READ_ONCE() can be relied upon to provide dependency ordering.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/asm-generic/rwonce.h | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h
index 04586b55a7c2..3a7f737c77bd 100644
--- a/include/asm-generic/rwonce.h
+++ b/include/asm-generic/rwonce.h
@@ -40,24 +40,16 @@
 
 /*
  * Use __READ_ONCE() instead of READ_ONCE() if you do not require any
- * atomicity or dependency ordering guarantees. Note that this may result
- * in tears!
+ * atomicity. Note that this may result in tears!
  */
 #ifndef __READ_ONCE
 #define __READ_ONCE(x)	(*(const volatile __unqual_scalar_typeof(x) *)&(x))
 #endif
 
-#define __READ_ONCE_SCALAR(x)						\
-({									\
-	__unqual_scalar_typeof(x) __x = __READ_ONCE(x);			\
-	smp_read_barrier_depends();					\
-	(typeof(x))__x;							\
-})
-
 #define READ_ONCE(x)							\
 ({									\
 	compiletime_assert_rwonce_type(x);				\
-	__READ_ONCE_SCALAR(x);						\
+	__READ_ONCE(x);							\
 })
 
 #define __WRITE_ONCE(x, val)						\
@@ -84,12 +76,9 @@ unsigned long __read_once_word_nocheck(const void *addr)
  */
 #define READ_ONCE_NOCHECK(x)						\
 ({									\
-	unsigned long __x;						\
-	compiletime_assert(sizeof(x) == sizeof(__x),			\
+	compiletime_assert(sizeof(x) == sizeof(unsigned long),		\
 		"Unsupported access size for READ_ONCE_NOCHECK().");	\
-	__x = __read_once_word_nocheck(&(x));				\
-	smp_read_barrier_depends();					\
-	(typeof(x))__x;							\
+	(typeof(x))__read_once_word_nocheck(&(x));			\
 })
 
 static __no_kasan_or_inline

From 002dff36acfba3476b685a09f78ffb7c452f5951 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Fri, 10 Jul 2020 14:49:40 +0100
Subject: [PATCH 379/502] asm/rwonce: Don't pull <asm/barrier.h> into
 'asm-generic/rwonce.h'

Now that 'smp_read_barrier_depends()' has gone the way of the Norwegian
Blue, drop the inclusion of <asm/barrier.h> in 'asm-generic/rwonce.h'.

This requires fixups to some architecture vdso headers which were
previously relying on 'asm/barrier.h' coming in via 'linux/compiler.h'.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm/include/asm/vdso/gettimeofday.h          | 1 +
 arch/arm64/include/asm/vdso/compat_gettimeofday.h | 1 +
 arch/arm64/include/asm/vdso/gettimeofday.h        | 1 +
 arch/riscv/include/asm/vdso/gettimeofday.h        | 1 +
 include/asm-generic/rwonce.h                      | 2 --
 include/linux/nospec.h                            | 2 ++
 6 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/vdso/gettimeofday.h b/arch/arm/include/asm/vdso/gettimeofday.h
index 36dc18553ed8..1b207cf07697 100644
--- a/arch/arm/include/asm/vdso/gettimeofday.h
+++ b/arch/arm/include/asm/vdso/gettimeofday.h
@@ -7,6 +7,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/barrier.h>
 #include <asm/errno.h>
 #include <asm/unistd.h>
 #include <asm/vdso/cp15.h>
diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h
index b6907ae78e53..bcf7649999a4 100644
--- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h
@@ -7,6 +7,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/barrier.h>
 #include <asm/unistd.h>
 #include <asm/errno.h>
 
diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h
index afba6ba332f8..127fa63893e2 100644
--- a/arch/arm64/include/asm/vdso/gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/gettimeofday.h
@@ -7,6 +7,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/barrier.h>
 #include <asm/unistd.h>
 
 #define VDSO_HAS_CLOCK_GETRES		1
diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h
index c8e818688ec1..3099362d9f26 100644
--- a/arch/riscv/include/asm/vdso/gettimeofday.h
+++ b/arch/riscv/include/asm/vdso/gettimeofday.h
@@ -4,6 +4,7 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/barrier.h>
 #include <asm/unistd.h>
 #include <asm/csr.h>
 #include <uapi/linux/time.h>
diff --git a/include/asm-generic/rwonce.h b/include/asm-generic/rwonce.h
index 3a7f737c77bd..8d0a6280e982 100644
--- a/include/asm-generic/rwonce.h
+++ b/include/asm-generic/rwonce.h
@@ -26,8 +26,6 @@
 #include <linux/kasan-checks.h>
 #include <linux/kcsan-checks.h>
 
-#include <asm/barrier.h>
-
 /*
  * Yes, this permits 64-bit accesses on 32-bit architectures. These will
  * actually be atomic in some cases (namely Armv7 + LPAE), but for others we
diff --git a/include/linux/nospec.h b/include/linux/nospec.h
index 0c5ef54fd416..c1e79f72cd89 100644
--- a/include/linux/nospec.h
+++ b/include/linux/nospec.h
@@ -5,6 +5,8 @@
 
 #ifndef _LINUX_NOSPEC_H
 #define _LINUX_NOSPEC_H
+
+#include <linux/compiler.h>
 #include <asm/barrier.h>
 
 struct task_struct;

From 71c0b9a65cefa8c34eab83d337a1e3ec61fb7cc2 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 30 Oct 2019 16:22:17 +0000
Subject: [PATCH 380/502] vhost: Remove redundant use of read_barrier_depends()
 barrier

Since commit 76ebbe78f739 ("locking/barriers: Add implicit
smp_read_barrier_depends() to READ_ONCE()"), there is no need to use
smp_read_barrier_depends() outside of the Alpha architecture code.

Unfortunately, there is precisely _one_ user in the vhost code, and
there isn't an obvious READ_ONCE() access making the barrier
redundant. However, on closer inspection (thanks, Jason), it appears
that vring synchronisation between the producer and consumer occurs via
the 'avail_idx' field, which is followed up by an rmb() in
vhost_get_vq_desc(), making the read_barrier_depends() redundant on
Alpha.

Jason says:

  | I'm also confused about the barrier here, basically in driver side
  | we did:
  |
  | 1) allocate pages
  | 2) store pages in indirect->addr
  | 3) smp_wmb()
  | 4) increase the avail idx (somehow a tail pointer of vring)
  |
  | in vhost we did:
  |
  | 1) read avail idx
  | 2) smp_rmb()
  | 3) read indirect->addr
  | 4) read from indirect->addr
  |
  | It looks to me even the data dependency barrier is not necessary
  | since we have rmb() which is sufficient for us to the correct
  | indirect->addr and driver are not expected to do any writing to
  | indirect->addr after avail idx is increased

Remove the redundant barrier invocation.

Acked-by: Michael S. Tsirkin <mst@redhat.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Suggested-by: Jason Wang <jasowang@redhat.com>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/vhost/vhost.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index d7b8df3edffc..74d135ee7e26 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -2092,11 +2092,6 @@ static int get_indirect(struct vhost_virtqueue *vq,
 		return ret;
 	}
 	iov_iter_init(&from, READ, vq->indirect, ret, len);
-
-	/* We will use the result as an address to read from, so most
-	 * architectures only need a compiler barrier here. */
-	read_barrier_depends();
-
 	count = len / sizeof desc;
 	/* Buffers are chained via a 16 bit next field, so
 	 * we can have at most 2^16 of these. */

From bb7cdd38185a4f9fa32e62db115c2c6dceb2b621 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 30 Oct 2019 17:15:01 +0000
Subject: [PATCH 381/502] alpha: Replace smp_read_barrier_depends() usage with
 smp_[r]mb()

In preparation for removing smp_read_barrier_depends() altogether,
move the Alpha code over to using smp_rmb() and smp_mb() directly.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/alpha/include/asm/atomic.h  | 16 ++++++++--------
 arch/alpha/include/asm/pgtable.h | 10 +++++-----
 mm/memory.c                      |  2 +-
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index 2144530d1428..2f8f7e54792f 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -16,10 +16,10 @@
 
 /*
  * To ensure dependency ordering is preserved for the _relaxed and
- * _release atomics, an smp_read_barrier_depends() is unconditionally
- * inserted into the _relaxed variants, which are used to build the
- * barriered versions. Avoid redundant back-to-back fences in the
- * _acquire and _fence versions.
+ * _release atomics, an smp_mb() is unconditionally inserted into the
+ * _relaxed variants, which are used to build the barriered versions.
+ * Avoid redundant back-to-back fences in the _acquire and _fence
+ * versions.
  */
 #define __atomic_acquire_fence()
 #define __atomic_post_full_fence()
@@ -70,7 +70,7 @@ static inline int atomic_##op##_return_relaxed(int i, atomic_t *v)	\
 	".previous"							\
 	:"=&r" (temp), "=m" (v->counter), "=&r" (result)		\
 	:"Ir" (i), "m" (v->counter) : "memory");			\
-	smp_read_barrier_depends();					\
+	smp_mb();							\
 	return result;							\
 }
 
@@ -88,7 +88,7 @@ static inline int atomic_fetch_##op##_relaxed(int i, atomic_t *v)	\
 	".previous"							\
 	:"=&r" (temp), "=m" (v->counter), "=&r" (result)		\
 	:"Ir" (i), "m" (v->counter) : "memory");			\
-	smp_read_barrier_depends();					\
+	smp_mb();							\
 	return result;							\
 }
 
@@ -123,7 +123,7 @@ static __inline__ s64 atomic64_##op##_return_relaxed(s64 i, atomic64_t * v)	\
 	".previous"							\
 	:"=&r" (temp), "=m" (v->counter), "=&r" (result)		\
 	:"Ir" (i), "m" (v->counter) : "memory");			\
-	smp_read_barrier_depends();					\
+	smp_mb();							\
 	return result;							\
 }
 
@@ -141,7 +141,7 @@ static __inline__ s64 atomic64_fetch_##op##_relaxed(s64 i, atomic64_t * v)	\
 	".previous"							\
 	:"=&r" (temp), "=m" (v->counter), "=&r" (result)		\
 	:"Ir" (i), "m" (v->counter) : "memory");			\
-	smp_read_barrier_depends();					\
+	smp_mb();							\
 	return result;							\
 }
 
diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h
index 162c17b2631f..660b14ce1317 100644
--- a/arch/alpha/include/asm/pgtable.h
+++ b/arch/alpha/include/asm/pgtable.h
@@ -277,9 +277,9 @@ extern inline pte_t pte_mkdirty(pte_t pte)	{ pte_val(pte) |= __DIRTY_BITS; retur
 extern inline pte_t pte_mkyoung(pte_t pte)	{ pte_val(pte) |= __ACCESS_BITS; return pte; }
 
 /*
- * The smp_read_barrier_depends() in the following functions are required to
- * order the load of *dir (the pointer in the top level page table) with any
- * subsequent load of the returned pmd_t *ret (ret is data dependent on *dir).
+ * The smp_rmb() in the following functions are required to order the load of
+ * *dir (the pointer in the top level page table) with any subsequent load of
+ * the returned pmd_t *ret (ret is data dependent on *dir).
  *
  * If this ordering is not enforced, the CPU might load an older value of
  * *ret, which may be uninitialized data. See mm/memory.c:__pte_alloc for
@@ -293,7 +293,7 @@ extern inline pte_t pte_mkyoung(pte_t pte)	{ pte_val(pte) |= __ACCESS_BITS; retu
 extern inline pmd_t * pmd_offset(pud_t * dir, unsigned long address)
 {
 	pmd_t *ret = (pmd_t *) pud_page_vaddr(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1));
-	smp_read_barrier_depends(); /* see above */
+	smp_rmb(); /* see above */
 	return ret;
 }
 #define pmd_offset pmd_offset
@@ -303,7 +303,7 @@ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address)
 {
 	pte_t *ret = (pte_t *) pmd_page_vaddr(*dir)
 		+ ((address >> PAGE_SHIFT) & (PTRS_PER_PAGE - 1));
-	smp_read_barrier_depends(); /* see above */
+	smp_rmb(); /* see above */
 	return ret;
 }
 #define pte_offset_kernel pte_offset_kernel
diff --git a/mm/memory.c b/mm/memory.c
index 87ec87cdc1ff..e1f2c730d8bb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -437,7 +437,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
 	 * of a chain of data-dependent loads, meaning most CPUs (alpha
 	 * being the notable exception) will already guarantee loads are
 	 * seen in-order. See the alpha page table accessors for the
-	 * smp_read_barrier_depends() barriers in page table walking code.
+	 * smp_rmb() barriers in page table walking code.
 	 */
 	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 

From 93fab07c22930c9ac4f01212fd92913c9a812f9f Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Wed, 30 Oct 2019 17:17:22 +0000
Subject: [PATCH 382/502] locking/barriers: Remove definitions for
 [smp_]read_barrier_depends()

There are no remaining users of [smp_]read_barrier_depends(), so
remove it from the generic implementation of 'barrier.h'.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/asm-generic/barrier.h | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 8116744bb82c..fec97dc34de7 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -46,10 +46,6 @@
 #define dma_wmb()	wmb()
 #endif
 
-#ifndef read_barrier_depends
-#define read_barrier_depends()		do { } while (0)
-#endif
-
 #ifndef __smp_mb
 #define __smp_mb()	mb()
 #endif
@@ -62,10 +58,6 @@
 #define __smp_wmb()	wmb()
 #endif
 
-#ifndef __smp_read_barrier_depends
-#define __smp_read_barrier_depends()	read_barrier_depends()
-#endif
-
 #ifdef CONFIG_SMP
 
 #ifndef smp_mb
@@ -80,10 +72,6 @@
 #define smp_wmb()	__smp_wmb()
 #endif
 
-#ifndef smp_read_barrier_depends
-#define smp_read_barrier_depends()	__smp_read_barrier_depends()
-#endif
-
 #else	/* !CONFIG_SMP */
 
 #ifndef smp_mb
@@ -98,10 +86,6 @@
 #define smp_wmb()	barrier()
 #endif
 
-#ifndef smp_read_barrier_depends
-#define smp_read_barrier_depends()	do { } while (0)
-#endif
-
 #endif	/* CONFIG_SMP */
 
 #ifndef __smp_store_mb
@@ -196,7 +180,6 @@ do {									\
 #define virt_mb() __smp_mb()
 #define virt_rmb() __smp_rmb()
 #define virt_wmb() __smp_wmb()
-#define virt_read_barrier_depends() __smp_read_barrier_depends()
 #define virt_store_mb(var, value) __smp_store_mb(var, value)
 #define virt_mb__before_atomic() __smp_mb__before_atomic()
 #define virt_mb__after_atomic()	__smp_mb__after_atomic()

From 8ca924aeb4f28e5bf552707e8ecbe105c4f17c7b Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 7 Nov 2019 14:36:37 +0000
Subject: [PATCH 383/502] Documentation/barriers: Remove references to
 [smp_]read_barrier_depends()

The [smp_]read_barrier_depends() barrier macros no longer exist as
part of the Linux memory model, so remove all references to them from
the Documentation/ directory.

Although this is fairly mechanical on the whole, we drop the "CACHE
COHERENCY" section entirely from 'memory-barriers.txt' as it doesn't
make any sense now that the dependency barriers have been removed.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 .../RCU/Design/Requirements/Requirements.rst  |   2 +-
 Documentation/memory-barriers.txt             | 156 +-----------------
 2 files changed, 9 insertions(+), 149 deletions(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst
index 75b8ca007a11..50d5c43c48b0 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -463,7 +463,7 @@ again without disrupting RCU readers.
 This guarantee was only partially premeditated. DYNIX/ptx used an
 explicit memory barrier for publication, but had nothing resembling
 ``rcu_dereference()`` for subscription, nor did it have anything
-resembling the ``smp_read_barrier_depends()`` that was later subsumed
+resembling the dependency-ordering barrier that was later subsumed
 into ``rcu_dereference()`` and later still into ``READ_ONCE()``. The
 need for these operations made itself known quite suddenly at a
 late-1990s meeting with the DEC Alpha architects, back in the days when
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index eaabc3134294..4e55aba3eb4a 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -553,12 +553,12 @@ There are certain things that the Linux kernel memory barriers do not guarantee:
 DATA DEPENDENCY BARRIERS (HISTORICAL)
 -------------------------------------
 
-As of v4.15 of the Linux kernel, an smp_read_barrier_depends() was
-added to READ_ONCE(), which means that about the only people who
-need to pay attention to this section are those working on DEC Alpha
-architecture-specific code and those working on READ_ONCE() itself.
-For those who need it, and for those who are interested in the history,
-here is the story of data-dependency barriers.
+As of v4.15 of the Linux kernel, an smp_mb() was added to READ_ONCE() for
+DEC Alpha, which means that about the only people who need to pay attention
+to this section are those working on DEC Alpha architecture-specific code
+and those working on READ_ONCE() itself.  For those who need it, and for
+those who are interested in the history, here is the story of
+data-dependency barriers.
 
 The usage requirements of data dependency barriers are a little subtle, and
 it's not always obvious that they're needed.  To illustrate, consider the
@@ -2708,144 +2708,6 @@ the properties of the memory window through which devices are accessed and/or
 the use of any special device communication instructions the CPU may have.
 
 
-CACHE COHERENCY
----------------
-
-Life isn't quite as simple as it may appear above, however: for while the
-caches are expected to be coherent, there's no guarantee that that coherency
-will be ordered.  This means that while changes made on one CPU will
-eventually become visible on all CPUs, there's no guarantee that they will
-become apparent in the same order on those other CPUs.
-
-
-Consider dealing with a system that has a pair of CPUs (1 & 2), each of which
-has a pair of parallel data caches (CPU 1 has A/B, and CPU 2 has C/D):
-
-	            :
-	            :                          +--------+
-	            :      +---------+         |        |
-	+--------+  : +--->| Cache A |<------->|        |
-	|        |  : |    +---------+         |        |
-	|  CPU 1 |<---+                        |        |
-	|        |  : |    +---------+         |        |
-	+--------+  : +--->| Cache B |<------->|        |
-	            :      +---------+         |        |
-	            :                          | Memory |
-	            :      +---------+         | System |
-	+--------+  : +--->| Cache C |<------->|        |
-	|        |  : |    +---------+         |        |
-	|  CPU 2 |<---+                        |        |
-	|        |  : |    +---------+         |        |
-	+--------+  : +--->| Cache D |<------->|        |
-	            :      +---------+         |        |
-	            :                          +--------+
-	            :
-
-Imagine the system has the following properties:
-
- (*) an odd-numbered cache line may be in cache A, cache C or it may still be
-     resident in memory;
-
- (*) an even-numbered cache line may be in cache B, cache D or it may still be
-     resident in memory;
-
- (*) while the CPU core is interrogating one cache, the other cache may be
-     making use of the bus to access the rest of the system - perhaps to
-     displace a dirty cacheline or to do a speculative load;
-
- (*) each cache has a queue of operations that need to be applied to that cache
-     to maintain coherency with the rest of the system;
-
- (*) the coherency queue is not flushed by normal loads to lines already
-     present in the cache, even though the contents of the queue may
-     potentially affect those loads.
-
-Imagine, then, that two writes are made on the first CPU, with a write barrier
-between them to guarantee that they will appear to reach that CPU's caches in
-the requisite order:
-
-	CPU 1		CPU 2		COMMENT
-	===============	===============	=======================================
-					u == 0, v == 1 and p == &u, q == &u
-	v = 2;
-	smp_wmb();			Make sure change to v is visible before
-					 change to p
-	<A:modify v=2>			v is now in cache A exclusively
-	p = &v;
-	<B:modify p=&v>			p is now in cache B exclusively
-
-The write memory barrier forces the other CPUs in the system to perceive that
-the local CPU's caches have apparently been updated in the correct order.  But
-now imagine that the second CPU wants to read those values:
-
-	CPU 1		CPU 2		COMMENT
-	===============	===============	=======================================
-	...
-			q = p;
-			x = *q;
-
-The above pair of reads may then fail to happen in the expected order, as the
-cacheline holding p may get updated in one of the second CPU's caches while
-the update to the cacheline holding v is delayed in the other of the second
-CPU's caches by some other cache event:
-
-	CPU 1		CPU 2		COMMENT
-	===============	===============	=======================================
-					u == 0, v == 1 and p == &u, q == &u
-	v = 2;
-	smp_wmb();
-	<A:modify v=2>	<C:busy>
-			<C:queue v=2>
-	p = &v;		q = p;
-			<D:request p>
-	<B:modify p=&v>	<D:commit p=&v>
-			<D:read p>
-			x = *q;
-			<C:read *q>	Reads from v before v updated in cache
-			<C:unbusy>
-			<C:commit v=2>
-
-Basically, while both cachelines will be updated on CPU 2 eventually, there's
-no guarantee that, without intervention, the order of update will be the same
-as that committed on CPU 1.
-
-
-To intervene, we need to interpolate a data dependency barrier or a read
-barrier between the loads (which as of v4.15 is supplied unconditionally
-by the READ_ONCE() macro).  This will force the cache to commit its
-coherency queue before processing any further requests:
-
-	CPU 1		CPU 2		COMMENT
-	===============	===============	=======================================
-					u == 0, v == 1 and p == &u, q == &u
-	v = 2;
-	smp_wmb();
-	<A:modify v=2>	<C:busy>
-			<C:queue v=2>
-	p = &v;		q = p;
-			<D:request p>
-	<B:modify p=&v>	<D:commit p=&v>
-			<D:read p>
-			smp_read_barrier_depends()
-			<C:unbusy>
-			<C:commit v=2>
-			x = *q;
-			<C:read *q>	Reads from v after v updated in cache
-
-
-This sort of problem can be encountered on DEC Alpha processors as they have a
-split cache that improves performance by making better use of the data bus.
-While most CPUs do imply a data dependency barrier on the read when a memory
-access depends on a read, not all do, so it may not be relied on.
-
-Other CPUs may also have split caches, but must coordinate between the various
-cachelets for normal memory accesses.  The semantics of the Alpha removes the
-need for hardware coordination in the absence of memory barriers, which
-permitted Alpha to sport higher CPU clock rates back in the day.  However,
-please note that (again, as of v4.15) smp_read_barrier_depends() should not
-be used except in Alpha arch-specific code and within the READ_ONCE() macro.
-
-
 CACHE COHERENCY VS DMA
 ----------------------
 
@@ -3009,10 +2871,8 @@ caches with the memory coherence system, thus making it seem like pointer
 changes vs new data occur in the right order.
 
 The Alpha defines the Linux kernel's memory model, although as of v4.15
-the Linux kernel's addition of smp_read_barrier_depends() to READ_ONCE()
-greatly reduced Alpha's impact on the memory model.
-
-See the subsection on "Cache Coherency" above.
+the Linux kernel's addition of smp_mb() to READ_ONCE() on Alpha greatly
+reduced its impact on the memory model.
 
 
 VIRTUAL MACHINE GUESTS

From 9ce1b14e74042a3477f880bee675945044880b01 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj38.park@gmail.com>
Date: Fri, 29 Nov 2019 19:08:37 +0100
Subject: [PATCH 384/502] Documentation/barriers/kokr: Remove references to
 [smp_]read_barrier_depends()

This commit translates commit ("Documentation/barriers: Remove references to
[smp_]read_barrier_depends()") into Korean.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Yunjae Lee <lyj7694@gmail.com>
Signed-off-by: SeongJae Park <sjpark@amazon.de>
Signed-off-by: Will Deacon <will@kernel.org>
---
 .../translations/ko_KR/memory-barriers.txt    | 146 +-----------------
 1 file changed, 3 insertions(+), 143 deletions(-)

diff --git a/Documentation/translations/ko_KR/memory-barriers.txt b/Documentation/translations/ko_KR/memory-barriers.txt
index 34d041d68f78..a1f772ef622c 100644
--- a/Documentation/translations/ko_KR/memory-barriers.txt
+++ b/Documentation/translations/ko_KR/memory-barriers.txt
@@ -577,7 +577,7 @@ ACQUIRE 는 해당 오퍼레이션의 로드 부분에만 적용되고 RELEASE 
 데이터 의존성 배리어 (역사적)
 -----------------------------
 
-리눅스 커널 v4.15 기준으로, smp_read_barrier_depends() 가 READ_ONCE() 에
+리눅스 커널 v4.15 기준으로, smp_mb() 가 DEC Alpha 용 READ_ONCE() 코드에
 추가되었는데, 이는 이 섹션에 주의를 기울여야 하는 사람들은 DEC Alpha 아키텍쳐
 전용 코드를 만드는 사람들과 READ_ONCE() 자체를 만드는 사람들 뿐임을 의미합니다.
 그런 분들을 위해, 그리고 역사에 관심 있는 분들을 위해, 여기 데이터 의존성
@@ -2664,144 +2664,6 @@ CPU 코어는 프로그램의 인과성이 유지된다고만 여겨진다면 
 수도 있습니다.
 
 
-캐시 일관성
------------
-
-하지만 삶은 앞에서 이야기한 것처럼 단순하지 않습니다: 캐시들은 일관적일 것으로
-기대되지만, 그 일관성이 순서에도 적용될 거라는 보장은 없습니다.  한 CPU 에서
-만들어진 변경 사항은 최종적으로는 시스템의 모든 CPU 에게 보여지게 되지만, 다른
-CPU 들에게도 같은 순서로 보이게 될 거라는 보장은 없다는 뜻입니다.
-
-
-두개의 CPU (1 & 2) 가 달려 있고, 각 CPU 에 두개의 데이터 캐시(CPU 1 은 A/B 를,
-CPU 2 는 C/D 를 갖습니다)가 병렬로 연결되어 있는 시스템을 다룬다고 생각해
-봅시다:
-
-	            :
-	            :                          +--------+
-	            :      +---------+         |        |
-	+--------+  : +--->| Cache A |<------->|        |
-	|        |  : |    +---------+         |        |
-	|  CPU 1 |<---+                        |        |
-	|        |  : |    +---------+         |        |
-	+--------+  : +--->| Cache B |<------->|        |
-	            :      +---------+         |        |
-	            :                          | Memory |
-	            :      +---------+         | System |
-	+--------+  : +--->| Cache C |<------->|        |
-	|        |  : |    +---------+         |        |
-	|  CPU 2 |<---+                        |        |
-	|        |  : |    +---------+         |        |
-	+--------+  : +--->| Cache D |<------->|        |
-	            :      +---------+         |        |
-	            :                          +--------+
-	            :
-
-이 시스템이 다음과 같은 특성을 갖는다 생각해 봅시다:
-
- (*) 홀수번 캐시라인은 캐시 A, 캐시 C 또는 메모리에 위치할 수 있음;
-
- (*) 짝수번 캐시라인은 캐시 B, 캐시 D 또는 메모리에 위치할 수 있음;
-
- (*) CPU 코어가 한개의 캐시에 접근하는 동안, 다른 캐시는 - 더티 캐시라인을
-     메모리에 내리거나 추측성 로드를 하거나 하기 위해 - 시스템의 다른 부분에
-     액세스 하기 위해 버스를 사용할 수 있음;
-
- (*) 각 캐시는 시스템의 나머지 부분들과 일관성을 맞추기 위해 해당 캐시에
-     적용되어야 할 오퍼레이션들의 큐를 가짐;
-
- (*) 이 일관성 큐는 캐시에 이미 존재하는 라인에 가해지는 평범한 로드에 의해서는
-     비워지지 않는데, 큐의 오퍼레이션들이 이 로드의 결과에 영향을 끼칠 수 있다
-     할지라도 그러함.
-
-이제, 첫번째 CPU 에서 두개의 쓰기 오퍼레이션을 만드는데, 해당 CPU 의 캐시에
-요청된 순서로 오퍼레이션이 도달됨을 보장하기 위해 두 오퍼레이션 사이에 쓰기
-배리어를 사용하는 상황을 상상해 봅시다:
-
-	CPU 1		CPU 2		COMMENT
-	===============	===============	=======================================
-					u == 0, v == 1 and p == &u, q == &u
-	v = 2;
-	smp_wmb();			v 의 변경이 p 의 변경 전에 보일 것을
-					 분명히 함
-	<A:modify v=2>			v 는 이제 캐시 A 에 독점적으로 존재함
-	p = &v;
-	<B:modify p=&v>			p 는 이제 캐시 B 에 독점적으로 존재함
-
-여기서의 쓰기 메모리 배리어는 CPU 1 의 캐시가 올바른 순서로 업데이트 된 것으로
-시스템의 다른 CPU 들이 인지하게 만듭니다.  하지만, 이제 두번째 CPU 가 그 값들을
-읽으려 하는 상황을 생각해 봅시다:
-
-	CPU 1		CPU 2		COMMENT
-	===============	===============	=======================================
-	...
-			q = p;
-			x = *q;
-
-위의 두개의 읽기 오퍼레이션은 예상된 순서로 일어나지 못할 수 있는데, 두번째 CPU
-의 한 캐시에 다른 캐시 이벤트가 발생해 v 를 담고 있는 캐시라인의 해당 캐시에의
-업데이트가 지연되는 사이, p 를 담고 있는 캐시라인은 두번째 CPU 의 다른 캐시에
-업데이트 되어버렸을 수 있기 때문입니다.
-
-	CPU 1		CPU 2		COMMENT
-	===============	===============	=======================================
-					u == 0, v == 1 and p == &u, q == &u
-	v = 2;
-	smp_wmb();
-	<A:modify v=2>	<C:busy>
-			<C:queue v=2>
-	p = &v;		q = p;
-			<D:request p>
-	<B:modify p=&v>	<D:commit p=&v>
-			<D:read p>
-			x = *q;
-			<C:read *q>	캐시에 업데이트 되기 전의 v 를 읽음
-			<C:unbusy>
-			<C:commit v=2>
-
-기본적으로, 두개의 캐시라인 모두 CPU 2 에 최종적으로는 업데이트 될 것이지만,
-별도의 개입 없이는, 업데이트의 순서가 CPU 1 에서 만들어진 순서와 동일할
-것이라는 보장이 없습니다.
-
-
-여기에 개입하기 위해선, 데이터 의존성 배리어나 읽기 배리어를 로드 오퍼레이션들
-사이에 넣어야 합니다 (v4.15 부터는 READ_ONCE() 매크로에 의해 무조건적으로
-그렇게 됩니다).  이렇게 함으로써 캐시가 다음 요청을 처리하기 전에 일관성 큐를
-처리하도록 강제하게 됩니다.
-
-	CPU 1		CPU 2		COMMENT
-	===============	===============	=======================================
-					u == 0, v == 1 and p == &u, q == &u
-	v = 2;
-	smp_wmb();
-	<A:modify v=2>	<C:busy>
-			<C:queue v=2>
-	p = &v;		q = p;
-			<D:request p>
-	<B:modify p=&v>	<D:commit p=&v>
-			<D:read p>
-			smp_read_barrier_depends()
-			<C:unbusy>
-			<C:commit v=2>
-			x = *q;
-			<C:read *q>	캐시에 업데이트 된 v 를 읽음
-
-
-이런 부류의 문제는 DEC Alpha 계열 프로세서들에서 발견될 수 있는데, 이들은
-데이터 버스를 좀 더 잘 사용해 성능을 개선할 수 있는, 분할된 캐시를 가지고 있기
-때문입니다.  대부분의 CPU 는 하나의 읽기 오퍼레이션의 메모리 액세스가 다른 읽기
-오퍼레이션에 의존적이라면 데이터 의존성 배리어를 내포시킵니다만, 모두가 그런건
-아니기 때문에 이점에 의존해선 안됩니다.
-
-다른 CPU 들도 분할된 캐시를 가지고 있을 수 있지만, 그런 CPU 들은 평범한 메모리
-액세스를 위해서도 이 분할된 캐시들 사이의 조정을 해야만 합니다.  Alpha 는 가장
-약한 메모리 순서 시맨틱 (semantic) 을 선택함으로써 메모리 배리어가 명시적으로
-사용되지 않았을 때에는 그런 조정이 필요하지 않게 했으며, 이는 Alpha 가 당시에
-더 높은 CPU 클락 속도를 가질 수 있게 했습니다.  하지만, (다시 말하건대, v4.15
-이후부터는) Alpha 아키텍쳐 전용 코드와 READ_ONCE() 매크로 내부에서를 제외하고는
-smp_read_barrier_depends() 가 사용되지 않아야 함을 알아두시기 바랍니다.
-
-
 캐시 일관성 VS DMA
 ------------------
 
@@ -2962,10 +2824,8 @@ Alpha CPU 의 일부 버전은 분할된 데이터 캐시를 가지고 있어서
 데이터의 발견을 올바른 순서로 일어나게 하기 때문입니다.
 
 리눅스 커널의 메모리 배리어 모델은 Alpha 에 기초해서 정의되었습니다만, v4.15
-부터는 리눅스 커널이 READ_ONCE() 내에 smp_read_barrier_depends() 를 추가해서
-Alpha 의 메모리 모델로의 영향력이 크게 줄어들긴 했습니다.
-
-위의 "캐시 일관성" 서브섹션을 참고하세요.
+부터는 Alpha 용 READ_ONCE() 코드 내에 smp_mb() 가 추가되어서 메모리 모델로의
+Alpha 의 영향력이 크게 줄어들었습니다.
 
 
 가상 머신 게스트

From 628fd55671f753a1e4fe8c21b6a0553503cade08 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 7 Nov 2019 14:44:06 +0000
Subject: [PATCH 385/502] tools/memory-model: Remove smp_read_barrier_depends()
 from informal doc

smp_read_barrier_depends() has gone the way of mmiowb() and so many
esoteric memory barriers before it. Drop the two mentions of this
deceased barrier from the LKMM informal explanation document.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 .../Documentation/explanation.txt             | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/tools/memory-model/Documentation/explanation.txt b/tools/memory-model/Documentation/explanation.txt
index e91a2eb19592..01adf9e0ebac 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -1122,12 +1122,10 @@ maintain at least the appearance of FIFO order.
 In practice, this difficulty is solved by inserting a special fence
 between P1's two loads when the kernel is compiled for the Alpha
 architecture.  In fact, as of version 4.15, the kernel automatically
-adds this fence (called smp_read_barrier_depends() and defined as
-nothing at all on non-Alpha builds) after every READ_ONCE() and atomic
-load.  The effect of the fence is to cause the CPU not to execute any
-po-later instructions until after the local cache has finished
-processing all the stores it has already received.  Thus, if the code
-was changed to:
+adds this fence after every READ_ONCE() and atomic load on Alpha.  The
+effect of the fence is to cause the CPU not to execute any po-later
+instructions until after the local cache has finished processing all
+the stores it has already received.  Thus, if the code was changed to:
 
 	P1()
 	{
@@ -1146,14 +1144,14 @@ READ_ONCE() or another synchronization primitive rather than accessed
 directly.
 
 The LKMM requires that smp_rmb(), acquire fences, and strong fences
-share this property with smp_read_barrier_depends(): They do not allow
-the CPU to execute any po-later instructions (or po-later loads in the
-case of smp_rmb()) until all outstanding stores have been processed by
-the local cache.  In the case of a strong fence, the CPU first has to
-wait for all of its po-earlier stores to propagate to every other CPU
-in the system; then it has to wait for the local cache to process all
-the stores received as of that time -- not just the stores received
-when the strong fence began.
+share this property: They do not allow the CPU to execute any po-later
+instructions (or po-later loads in the case of smp_rmb()) until all
+outstanding stores have been processed by the local cache.  In the
+case of a strong fence, the CPU first has to wait for all of its
+po-earlier stores to propagate to every other CPU in the system; then
+it has to wait for the local cache to process all the stores received
+as of that time -- not just the stores received when the strong fence
+began.
 
 And of course, none of this matters for any architecture other than
 Alpha.

From c6cd2e011655aead2097273a04350f52429a1a8d Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 7 Nov 2019 14:46:59 +0000
Subject: [PATCH 386/502] include/linux: Remove smp_read_barrier_depends() from
 comments

smp_read_barrier_depends() doesn't exist any more, so reword the two
comments that mention it to refer to "dependency ordering" instead.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/percpu-refcount.h | 2 +-
 include/linux/ptr_ring.h        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 22d9d183950d..87d8a38bdea1 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -155,7 +155,7 @@ static inline bool __ref_is_percpu(struct percpu_ref *ref,
 	 * between contaminating the pointer value, meaning that
 	 * READ_ONCE() is required when fetching it.
 	 *
-	 * The smp_read_barrier_depends() implied by READ_ONCE() pairs
+	 * The dependency ordering from the READ_ONCE() pairs
 	 * with smp_store_release() in __percpu_ref_switch_to_percpu().
 	 */
 	percpu_ptr = READ_ONCE(ref->percpu_count_ptr);
diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h
index 417db0a79a62..808f9d3ee546 100644
--- a/include/linux/ptr_ring.h
+++ b/include/linux/ptr_ring.h
@@ -107,7 +107,7 @@ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
 		return -ENOSPC;
 
 	/* Make sure the pointer we are storing points to a valid data. */
-	/* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
+	/* Pairs with the dependency ordering in __ptr_ring_consume. */
 	smp_wmb();
 
 	WRITE_ONCE(r->queue[r->producer++], ptr);

From ad83ec6ce13618a8b975ffdd8291742cb5b0005b Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Thu, 7 Nov 2019 14:49:00 +0000
Subject: [PATCH 387/502] checkpatch: Remove checks relating to
 [smp_]read_barrier_depends()

The [smp_]read_barrier_depends() macros no longer exist, so we don't
need to deal with them in the checkpatch script.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 scripts/checkpatch.pl | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index 4c820607540b..8032f80c5bc7 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -5903,8 +5903,7 @@ sub process {
 		my $barriers = qr{
 			mb|
 			rmb|
-			wmb|
-			read_barrier_depends
+			wmb
 		}x;
 		my $barrier_stems = qr{
 			mb__before_atomic|
@@ -5953,12 +5952,6 @@ sub process {
 			}
 		}
 
-# check for smp_read_barrier_depends and read_barrier_depends
-		if (!$file && $line =~ /\b(smp_|)read_barrier_depends\s*\(/) {
-			WARN("READ_BARRIER_DEPENDS",
-			     "$1read_barrier_depends should only be used in READ_ONCE or DEC Alpha code\n" . $herecurr);
-		}
-
 # check of hardware specific defines
 		if ($line =~ m@^.\s*\#\s*if.*\b(__i386__|__powerpc64__|__sun__|__s390x__)\b@ && $realfile !~ m@include/asm-@) {
 			CHK("ARCH_DEFINES",

From eb5c2d4b45e3d2d5d052ea6b8f1463976b1020d5 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 21 Jul 2020 09:54:15 +0100
Subject: [PATCH 388/502] compiler.h: Move compiletime_assert() macros into
 compiler_types.h

The kernel test robot reports that moving READ_ONCE() out into its own
header breaks a W=1 build for parisc, which is relying on the definition
of compiletime_assert() being available:

  | In file included from ./arch/parisc/include/generated/asm/rwonce.h:1,
  |                  from ./include/asm-generic/barrier.h:16,
  |                  from ./arch/parisc/include/asm/barrier.h:29,
  |                  from ./arch/parisc/include/asm/atomic.h:11,
  |                  from ./include/linux/atomic.h:7,
  |                  from kernel/locking/percpu-rwsem.c:2:
  | ./arch/parisc/include/asm/atomic.h: In function 'atomic_read':
  | ./include/asm-generic/rwonce.h:36:2: error: implicit declaration of function 'compiletime_assert' [-Werror=implicit-function-declaration]
  |    36 |  compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \
  |       |  ^~~~~~~~~~~~~~~~~~
  | ./include/asm-generic/rwonce.h:49:2: note: in expansion of macro 'compiletime_assert_rwonce_type'
  |    49 |  compiletime_assert_rwonce_type(x);    \
  |       |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  | ./arch/parisc/include/asm/atomic.h:73:9: note: in expansion of macro 'READ_ONCE'
  |    73 |  return READ_ONCE((v)->counter);
  |       |         ^~~~~~~~~

Move these macros into compiler_types.h, so that they are available to
READ_ONCE() and friends.

Link: http://lists.infradead.org/pipermail/linux-arm-kernel/2020-July/587094.html
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/compiler.h       | 41 ----------------------------------
 include/linux/compiler_types.h | 41 ++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index f075a3df4fe2..59f7194fdf08 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -273,47 +273,6 @@ static inline void *offset_to_ptr(const int *off)
 
 #endif /* __ASSEMBLY__ */
 
-/* Compile time object size, -1 for unknown */
-#ifndef __compiletime_object_size
-# define __compiletime_object_size(obj) -1
-#endif
-#ifndef __compiletime_warning
-# define __compiletime_warning(message)
-#endif
-#ifndef __compiletime_error
-# define __compiletime_error(message)
-#endif
-
-#ifdef __OPTIMIZE__
-# define __compiletime_assert(condition, msg, prefix, suffix)		\
-	do {								\
-		extern void prefix ## suffix(void) __compiletime_error(msg); \
-		if (!(condition))					\
-			prefix ## suffix();				\
-	} while (0)
-#else
-# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0)
-#endif
-
-#define _compiletime_assert(condition, msg, prefix, suffix) \
-	__compiletime_assert(condition, msg, prefix, suffix)
-
-/**
- * compiletime_assert - break build and emit msg if condition is false
- * @condition: a compile-time constant condition to check
- * @msg:       a message to emit if condition is false
- *
- * In tradition of POSIX assert, this macro will break the build if the
- * supplied condition is *false*, emitting the supplied error message if the
- * compiler has support to do so.
- */
-#define compiletime_assert(condition, msg) \
-	_compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
-
-#define compiletime_assert_atomic_type(t)				\
-	compiletime_assert(__native_word(t),				\
-		"Need native word sized stores/loads for atomicity.")
-
 /* &a[0] degrades to a pointer: a different type from an array */
 #define __must_be_array(a)	BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0]))
 
diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index c3bf7710f69a..d9bbb62a3e2a 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -300,6 +300,47 @@ struct ftrace_likely_data {
 	(sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \
 	 sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long))
 
+/* Compile time object size, -1 for unknown */
+#ifndef __compiletime_object_size
+# define __compiletime_object_size(obj) -1
+#endif
+#ifndef __compiletime_warning
+# define __compiletime_warning(message)
+#endif
+#ifndef __compiletime_error
+# define __compiletime_error(message)
+#endif
+
+#ifdef __OPTIMIZE__
+# define __compiletime_assert(condition, msg, prefix, suffix)		\
+	do {								\
+		extern void prefix ## suffix(void) __compiletime_error(msg); \
+		if (!(condition))					\
+			prefix ## suffix();				\
+	} while (0)
+#else
+# define __compiletime_assert(condition, msg, prefix, suffix) do { } while (0)
+#endif
+
+#define _compiletime_assert(condition, msg, prefix, suffix) \
+	__compiletime_assert(condition, msg, prefix, suffix)
+
+/**
+ * compiletime_assert - break build and emit msg if condition is false
+ * @condition: a compile-time constant condition to check
+ * @msg:       a message to emit if condition is false
+ *
+ * In tradition of POSIX assert, this macro will break the build if the
+ * supplied condition is *false*, emitting the supplied error message if the
+ * compiler has support to do so.
+ */
+#define compiletime_assert(condition, msg) \
+	_compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
+
+#define compiletime_assert_atomic_type(t)				\
+	compiletime_assert(__native_word(t),				\
+		"Need native word sized stores/loads for atomicity.")
+
 /* Helpers for emitting diagnostics in pragmas. */
 #ifndef __diag
 #define __diag(string)

From 5f1f7f6c205a2e7f1d92229ac358254bd2826c2d Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 30 Jun 2020 13:53:07 +0100
Subject: [PATCH 389/502] arm64: Reduce the number of header files pulled into
 vmlinux.lds.S

Although vmlinux.lds.S smells like an assembly file and is compiled
with __ASSEMBLY__ defined, it's actually just fed to the preprocessor to
create our linker script. This means that any assembly macros defined
by headers that it includes will result in a helpful link error:

| aarch64-linux-gnu-ld:./arch/arm64/kernel/vmlinux.lds:1: syntax error

In preparation for an arm64-private asm/rwonce.h implementation, which
will end up pulling assembly macros into linux/compiler.h, reduce the
number of headers we include directly and transitively in vmlinux.lds.S.

Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/kernel-pgtable.h |  2 +-
 arch/arm64/include/asm/memory.h         | 11 ++++++-----
 arch/arm64/include/asm/uaccess.h        |  1 +
 arch/arm64/kernel/entry.S               |  1 +
 arch/arm64/kernel/vmlinux.lds.S         |  1 -
 arch/arm64/kvm/hyp-init.S               |  1 +
 6 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h
index 3bf626f6fe0c..329fb15f6bac 100644
--- a/arch/arm64/include/asm/kernel-pgtable.h
+++ b/arch/arm64/include/asm/kernel-pgtable.h
@@ -8,7 +8,7 @@
 #ifndef __ASM_KERNEL_PGTABLE_H
 #define __ASM_KERNEL_PGTABLE_H
 
-#include <linux/pgtable.h>
+#include <asm/pgtable-hwdef.h>
 #include <asm/sparsemem.h>
 
 /*
diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index a1871bb32bb1..9d4bf58cf7b3 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -10,11 +10,8 @@
 #ifndef __ASM_MEMORY_H
 #define __ASM_MEMORY_H
 
-#include <linux/compiler.h>
 #include <linux/const.h>
 #include <linux/sizes.h>
-#include <linux/types.h>
-#include <asm/bug.h>
 #include <asm/page-def.h>
 
 /*
@@ -157,11 +154,15 @@
 #endif
 
 #ifndef __ASSEMBLY__
-extern u64			vabits_actual;
-#define PAGE_END		(_PAGE_END(vabits_actual))
 
 #include <linux/bitops.h>
+#include <linux/compiler.h>
 #include <linux/mmdebug.h>
+#include <linux/types.h>
+#include <asm/bug.h>
+
+extern u64			vabits_actual;
+#define PAGE_END		(_PAGE_END(vabits_actual))
 
 extern s64			physvirt_offset;
 extern s64			memstart_addr;
diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h
index bc5c7b091152..8d7c466f809b 100644
--- a/arch/arm64/include/asm/uaccess.h
+++ b/arch/arm64/include/asm/uaccess.h
@@ -19,6 +19,7 @@
 #include <linux/string.h>
 
 #include <asm/cpufeature.h>
+#include <asm/mmu.h>
 #include <asm/ptrace.h>
 #include <asm/memory.h>
 #include <asm/extable.h>
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5304d193c79d..b668aad3b762 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -15,6 +15,7 @@
 #include <asm/assembler.h>
 #include <asm/asm-offsets.h>
 #include <asm/asm_pointer_auth.h>
+#include <asm/bug.h>
 #include <asm/cpufeature.h>
 #include <asm/errno.h>
 #include <asm/esr.h>
diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S
index 6827da7f3aa5..e1e7c0431b4d 100644
--- a/arch/arm64/kernel/vmlinux.lds.S
+++ b/arch/arm64/kernel/vmlinux.lds.S
@@ -10,7 +10,6 @@
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/cache.h>
 #include <asm/kernel-pgtable.h>
-#include <asm/thread_info.h>
 #include <asm/memory.h>
 #include <asm/page.h>
 
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index 6e6ed5581eed..076544393c3c 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -6,6 +6,7 @@
 
 #include <linux/linkage.h>
 
+#include <asm/alternative.h>
 #include <asm/assembler.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_mmu.h>

From 55fdc1f44cd6bb1d61c9ca946d8f7cd67ea0bf36 Mon Sep 17 00:00:00 2001
From: Shaokun Zhang <zhangshaokun@hisilicon.com>
Date: Tue, 21 Jul 2020 18:49:33 +0800
Subject: [PATCH 390/502] arm64: perf: Expose some new events via sysfs

Some new PMU events can be detected via PMCEID1_EL0, but they are not
listed. Let's expose them through sysfs.

Signed-off-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/1595328573-12751-2-git-send-email-zhangshaokun@hisilicon.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/include/asm/perf_event.h | 27 +++++++++++++++++++++++++++
 arch/arm64/kernel/perf_event.c      | 19 +++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h
index e7765b62c712..2c2d7dbe8a02 100644
--- a/arch/arm64/include/asm/perf_event.h
+++ b/arch/arm64/include/asm/perf_event.h
@@ -72,6 +72,13 @@
 #define ARMV8_PMUV3_PERFCTR_LL_CACHE_RD				0x36
 #define ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD			0x37
 #define ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD			0x38
+#define ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD			0x39
+#define ARMV8_PMUV3_PERFCTR_OP_RETIRED				0x3A
+#define ARMV8_PMUV3_PERFCTR_OP_SPEC				0x3B
+#define ARMV8_PMUV3_PERFCTR_STALL				0x3C
+#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND			0x3D
+#define ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND			0x3E
+#define ARMV8_PMUV3_PERFCTR_STALL_SLOT				0x3F
 
 /* Statistical profiling extension microarchitectural events */
 #define	ARMV8_SPE_PERFCTR_SAMPLE_POP				0x4000
@@ -79,6 +86,26 @@
 #define	ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE			0x4002
 #define	ARMV8_SPE_PERFCTR_SAMPLE_COLLISION			0x4003
 
+/* AMUv1 architecture events */
+#define	ARMV8_AMU_PERFCTR_CNT_CYCLES				0x4004
+#define	ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM			0x4005
+
+/* long-latency read miss events */
+#define	ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS			0x4006
+#define	ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD			0x4009
+#define	ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS			0x400A
+#define	ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD			0x400B
+
+/* additional latency from alignment events */
+#define	ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT			0x4020
+#define	ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT			0x4021
+#define	ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT			0x4022
+
+/* Armv8.5 Memory Tagging Extension events */
+#define	ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED			0x4024
+#define	ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD			0x4025
+#define	ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR			0x4026
+
 /* ARMv8 recommended implementation defined event types */
 #define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_RD			0x40
 #define ARMV8_IMPDEF_PERFCTR_L1D_CACHE_WR			0x41
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index fdb6029c9021..462f9a9cc44b 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -225,10 +225,29 @@ static struct attribute *armv8_pmuv3_event_attrs[] = {
 	ARMV8_EVENT_ATTR(ll_cache_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_RD),
 	ARMV8_EVENT_ATTR(ll_cache_miss_rd, ARMV8_PMUV3_PERFCTR_LL_CACHE_MISS_RD),
 	ARMV8_EVENT_ATTR(remote_access_rd, ARMV8_PMUV3_PERFCTR_REMOTE_ACCESS_RD),
+	ARMV8_EVENT_ATTR(l1d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L1D_CACHE_LMISS_RD),
+	ARMV8_EVENT_ATTR(op_retired, ARMV8_PMUV3_PERFCTR_OP_RETIRED),
+	ARMV8_EVENT_ATTR(op_spec, ARMV8_PMUV3_PERFCTR_OP_SPEC),
+	ARMV8_EVENT_ATTR(stall, ARMV8_PMUV3_PERFCTR_STALL),
+	ARMV8_EVENT_ATTR(stall_slot_backend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_BACKEND),
+	ARMV8_EVENT_ATTR(stall_slot_frontend, ARMV8_PMUV3_PERFCTR_STALL_SLOT_FRONTEND),
+	ARMV8_EVENT_ATTR(stall_slot, ARMV8_PMUV3_PERFCTR_STALL_SLOT),
 	ARMV8_EVENT_ATTR(sample_pop, ARMV8_SPE_PERFCTR_SAMPLE_POP),
 	ARMV8_EVENT_ATTR(sample_feed, ARMV8_SPE_PERFCTR_SAMPLE_FEED),
 	ARMV8_EVENT_ATTR(sample_filtrate, ARMV8_SPE_PERFCTR_SAMPLE_FILTRATE),
 	ARMV8_EVENT_ATTR(sample_collision, ARMV8_SPE_PERFCTR_SAMPLE_COLLISION),
+	ARMV8_EVENT_ATTR(cnt_cycles, ARMV8_AMU_PERFCTR_CNT_CYCLES),
+	ARMV8_EVENT_ATTR(stall_backend_mem, ARMV8_AMU_PERFCTR_STALL_BACKEND_MEM),
+	ARMV8_EVENT_ATTR(l1i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L1I_CACHE_LMISS),
+	ARMV8_EVENT_ATTR(l2d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L2D_CACHE_LMISS_RD),
+	ARMV8_EVENT_ATTR(l2i_cache_lmiss, ARMV8_PMUV3_PERFCTR_L2I_CACHE_LMISS),
+	ARMV8_EVENT_ATTR(l3d_cache_lmiss_rd, ARMV8_PMUV3_PERFCTR_L3D_CACHE_LMISS_RD),
+	ARMV8_EVENT_ATTR(ldst_align_lat, ARMV8_PMUV3_PERFCTR_LDST_ALIGN_LAT),
+	ARMV8_EVENT_ATTR(ld_align_lat, ARMV8_PMUV3_PERFCTR_LD_ALIGN_LAT),
+	ARMV8_EVENT_ATTR(st_align_lat, ARMV8_PMUV3_PERFCTR_ST_ALIGN_LAT),
+	ARMV8_EVENT_ATTR(mem_access_checked, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED),
+	ARMV8_EVENT_ATTR(mem_access_checked_rd, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_RD),
+	ARMV8_EVENT_ATTR(mem_access_checked_wr, ARMV8_MTE_PERFCTR_MEM_ACCESS_CHECKED_WR),
 	NULL,
 };
 

From c2127e14c127de2775feefdfb1444e30a129a59f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 18 Jul 2020 17:30:27 -0700
Subject: [PATCH 391/502] perf: <linux/perf_event.h>: drop a duplicated word

Drop the repeated word "the" in a comment.

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200719003027.20798-1-rdunlap@infradead.org
---
 include/linux/perf_event.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3b22db08b6fb..0edd257a5916 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -366,7 +366,7 @@ struct pmu {
 	 * ->stop() with PERF_EF_UPDATE will read the counter and update
 	 *  period/count values like ->read() would.
 	 *
-	 * ->start() with PERF_EF_RELOAD will reprogram the the counter
+	 * ->start() with PERF_EF_RELOAD will reprogram the counter
 	 *  value, must be preceded by a ->stop() with PERF_EF_UPDATE.
 	 */
 	void (*start)			(struct perf_event *event, int flags);

From 2ac5413e5edca6910d2ae157187a889e94be2b62 Mon Sep 17 00:00:00 2001
From: Hu Haowen <xianfengting221@163.com>
Date: Sun, 19 Jul 2020 18:50:07 +0800
Subject: [PATCH 392/502] x86/perf: Fix a typo

The word "Zhoaxin" is incorrect and the right one is "Zhaoxin".

Signed-off-by: Hu Haowen <xianfengting221@163.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200719105007.57649-1-xianfengting221@163.com
---
 arch/x86/events/zhaoxin/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/zhaoxin/core.c b/arch/x86/events/zhaoxin/core.c
index 898fa1ae9ceb..e68827e604ad 100644
--- a/arch/x86/events/zhaoxin/core.c
+++ b/arch/x86/events/zhaoxin/core.c
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0-only
 /*
- * Zhoaxin PMU; like Intel Architectural PerfMon-v2
+ * Zhaoxin PMU; like Intel Architectural PerfMon-v2
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

From 493cf9b723bcc87e9284e5e5971259951a13f22e Mon Sep 17 00:00:00 2001
From: Vladimir Murzin <vladimir.murzin@arm.com>
Date: Tue, 21 Jul 2020 10:12:59 +0100
Subject: [PATCH 393/502] arm64: s/AMEVTYPE/AMEVTYPER

Activity Monitor Event Type Registers are named as AMEVTYPER{0,1}<n>

Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20200721091259.102756-1-vladimir.murzin@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/sysreg.h |  4 +-
 arch/arm64/kvm/sys_regs.c       | 68 ++++++++++++++++-----------------
 2 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 463175f80341..273bb1d15d21 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -421,9 +421,9 @@
  */
 
 #define SYS_AMEVCNTR0_EL0(n)		SYS_AM_EL0(4 + ((n) >> 3), (n) & 7)
-#define SYS_AMEVTYPE0_EL0(n)		SYS_AM_EL0(6 + ((n) >> 3), (n) & 7)
+#define SYS_AMEVTYPER0_EL0(n)		SYS_AM_EL0(6 + ((n) >> 3), (n) & 7)
 #define SYS_AMEVCNTR1_EL0(n)		SYS_AM_EL0(12 + ((n) >> 3), (n) & 7)
-#define SYS_AMEVTYPE1_EL0(n)		SYS_AM_EL0(14 + ((n) >> 3), (n) & 7)
+#define SYS_AMEVTYPER1_EL0(n)		SYS_AM_EL0(14 + ((n) >> 3), (n) & 7)
 
 /* AMU v1: Fixed (architecturally defined) activity monitors */
 #define SYS_AMEVCNTR0_CORE_EL0		SYS_AMEVCNTR0_EL0(0)
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index baf5ce9225ce..d3196671c590 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1024,9 +1024,9 @@ static bool access_amu(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
 
 /* Macro to expand the AMU counter and type registers*/
 #define AMU_AMEVCNTR0_EL0(n) { SYS_DESC(SYS_AMEVCNTR0_EL0(n)), access_amu }
-#define AMU_AMEVTYPE0_EL0(n) { SYS_DESC(SYS_AMEVTYPE0_EL0(n)), access_amu }
+#define AMU_AMEVTYPER0_EL0(n) { SYS_DESC(SYS_AMEVTYPER0_EL0(n)), access_amu }
 #define AMU_AMEVCNTR1_EL0(n) { SYS_DESC(SYS_AMEVCNTR1_EL0(n)), access_amu }
-#define AMU_AMEVTYPE1_EL0(n) { SYS_DESC(SYS_AMEVTYPE1_EL0(n)), access_amu }
+#define AMU_AMEVTYPER1_EL0(n) { SYS_DESC(SYS_AMEVTYPER1_EL0(n)), access_amu }
 
 static bool trap_ptrauth(struct kvm_vcpu *vcpu,
 			 struct sys_reg_params *p,
@@ -1629,22 +1629,22 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	AMU_AMEVCNTR0_EL0(13),
 	AMU_AMEVCNTR0_EL0(14),
 	AMU_AMEVCNTR0_EL0(15),
-	AMU_AMEVTYPE0_EL0(0),
-	AMU_AMEVTYPE0_EL0(1),
-	AMU_AMEVTYPE0_EL0(2),
-	AMU_AMEVTYPE0_EL0(3),
-	AMU_AMEVTYPE0_EL0(4),
-	AMU_AMEVTYPE0_EL0(5),
-	AMU_AMEVTYPE0_EL0(6),
-	AMU_AMEVTYPE0_EL0(7),
-	AMU_AMEVTYPE0_EL0(8),
-	AMU_AMEVTYPE0_EL0(9),
-	AMU_AMEVTYPE0_EL0(10),
-	AMU_AMEVTYPE0_EL0(11),
-	AMU_AMEVTYPE0_EL0(12),
-	AMU_AMEVTYPE0_EL0(13),
-	AMU_AMEVTYPE0_EL0(14),
-	AMU_AMEVTYPE0_EL0(15),
+	AMU_AMEVTYPER0_EL0(0),
+	AMU_AMEVTYPER0_EL0(1),
+	AMU_AMEVTYPER0_EL0(2),
+	AMU_AMEVTYPER0_EL0(3),
+	AMU_AMEVTYPER0_EL0(4),
+	AMU_AMEVTYPER0_EL0(5),
+	AMU_AMEVTYPER0_EL0(6),
+	AMU_AMEVTYPER0_EL0(7),
+	AMU_AMEVTYPER0_EL0(8),
+	AMU_AMEVTYPER0_EL0(9),
+	AMU_AMEVTYPER0_EL0(10),
+	AMU_AMEVTYPER0_EL0(11),
+	AMU_AMEVTYPER0_EL0(12),
+	AMU_AMEVTYPER0_EL0(13),
+	AMU_AMEVTYPER0_EL0(14),
+	AMU_AMEVTYPER0_EL0(15),
 	AMU_AMEVCNTR1_EL0(0),
 	AMU_AMEVCNTR1_EL0(1),
 	AMU_AMEVCNTR1_EL0(2),
@@ -1661,22 +1661,22 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	AMU_AMEVCNTR1_EL0(13),
 	AMU_AMEVCNTR1_EL0(14),
 	AMU_AMEVCNTR1_EL0(15),
-	AMU_AMEVTYPE1_EL0(0),
-	AMU_AMEVTYPE1_EL0(1),
-	AMU_AMEVTYPE1_EL0(2),
-	AMU_AMEVTYPE1_EL0(3),
-	AMU_AMEVTYPE1_EL0(4),
-	AMU_AMEVTYPE1_EL0(5),
-	AMU_AMEVTYPE1_EL0(6),
-	AMU_AMEVTYPE1_EL0(7),
-	AMU_AMEVTYPE1_EL0(8),
-	AMU_AMEVTYPE1_EL0(9),
-	AMU_AMEVTYPE1_EL0(10),
-	AMU_AMEVTYPE1_EL0(11),
-	AMU_AMEVTYPE1_EL0(12),
-	AMU_AMEVTYPE1_EL0(13),
-	AMU_AMEVTYPE1_EL0(14),
-	AMU_AMEVTYPE1_EL0(15),
+	AMU_AMEVTYPER1_EL0(0),
+	AMU_AMEVTYPER1_EL0(1),
+	AMU_AMEVTYPER1_EL0(2),
+	AMU_AMEVTYPER1_EL0(3),
+	AMU_AMEVTYPER1_EL0(4),
+	AMU_AMEVTYPER1_EL0(5),
+	AMU_AMEVTYPER1_EL0(6),
+	AMU_AMEVTYPER1_EL0(7),
+	AMU_AMEVTYPER1_EL0(8),
+	AMU_AMEVTYPER1_EL0(9),
+	AMU_AMEVTYPER1_EL0(10),
+	AMU_AMEVTYPER1_EL0(11),
+	AMU_AMEVTYPER1_EL0(12),
+	AMU_AMEVTYPER1_EL0(13),
+	AMU_AMEVTYPER1_EL0(14),
+	AMU_AMEVTYPER1_EL0(15),
 
 	{ SYS_DESC(SYS_CNTP_TVAL_EL0), access_arch_timer },
 	{ SYS_DESC(SYS_CNTP_CTL_EL0), access_arch_timer },

From 58e15716feb562cdba57e99d62c525a1faa37c08 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 20 Jul 2020 14:15:02 +0200
Subject: [PATCH 394/502] s390/time: use CLOCKSOURCE_MASK

Make use of CLOCKSOURCE_MASK instead of open-coding it.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/kernel/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 700127ba689d..317059684847 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -253,7 +253,7 @@ static struct clocksource clocksource_tod = {
 	.name		= "tod",
 	.rating		= 400,
 	.read		= read_tod_clock,
-	.mask		= -1ULL,
+	.mask		= CLOCKSOURCE_MASK(64),
 	.mult		= 1000,
 	.shift		= 12,
 	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,

From 555701a714f77e01490f633c1080cf97f0ede1f0 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 20 Jul 2020 14:16:03 +0200
Subject: [PATCH 395/502] s390/time: select CLOCKSOURCE_VALIDATE_LAST_CYCLE

The value returned by read_tod_clock() will overflow on September 17th 2042.
To keep system time from jumping back, select CLOCKSOURCE_VALIDATE_LAST_CYCLE,
which enables a sanity check in order to prevent negative "delta" values.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 0df33cffec52..d95d323cf213 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -125,6 +125,7 @@ config S390
 	select HAVE_ARCH_JUMP_LABEL_RELATIVE
 	select HAVE_ARCH_KASAN
 	select HAVE_ARCH_KASAN_VMALLOC
+	select CLOCKSOURCE_VALIDATE_LAST_CYCLE
 	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_SOFT_DIRTY

From 411155820bb348e71ecc5b1db147b36af98cbc96 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Mon, 20 Jul 2020 14:28:36 +0200
Subject: [PATCH 396/502] s390/time: improve comparison for tod steering

It doesn't make sense to add zero shifted by 15. It's still zero.

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/kernel/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 317059684847..513e59d08a55 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -237,7 +237,7 @@ static u64 read_tod_clock(struct clocksource *cs)
 	preempt_disable(); /* protect from changes to steering parameters */
 	now = get_tod_clock();
 	adj = tod_steering_end - now;
-	if (unlikely((s64) adj >= 0))
+	if (unlikely((s64) adj > 0))
 		/*
 		 * manually steer by 1 cycle every 2^16 cycles. This
 		 * corresponds to shifting the tod delta by 15. 1s is

From ec0160891e387f4771f953b888b1fe951398e5d9 Mon Sep 17 00:00:00 2001
From: Jon Derrick <jonathan.derrick@intel.com>
Date: Tue, 21 Jul 2020 14:26:09 -0600
Subject: [PATCH 397/502] irqdomain/treewide: Free firmware node after domain
 removal

Commit 711419e504eb ("irqdomain: Add the missing assignment of
domain->fwnode for named fwnode") unintentionally caused a dangling pointer
page fault issue on firmware nodes that were freed after IRQ domain
allocation. Commit e3beca48a45b fixed that dangling pointer issue by only
freeing the firmware node after an IRQ domain allocation failure. That fix
no longer frees the firmware node immediately, but leaves the firmware node
allocated after the domain is removed.

The firmware node must be kept around through irq_domain_remove, but should be
freed afterwards.

Add the missing free operations after domain removal where appropriate.

Fixes: e3beca48a45b ("irqdomain/treewide: Keep firmware node unconditionally allocated")
Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>	# drivers/pci
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/1595363169-7157-1-git-send-email-jonathan.derrick@intel.com
---
 arch/mips/pci/pci-xtalk-bridge.c    | 3 +++
 arch/x86/kernel/apic/io_apic.c      | 5 +++++
 drivers/iommu/intel/irq_remapping.c | 8 ++++++++
 drivers/mfd/ioc3.c                  | 6 ++++++
 drivers/pci/controller/vmd.c        | 3 +++
 5 files changed, 25 insertions(+)

diff --git a/arch/mips/pci/pci-xtalk-bridge.c b/arch/mips/pci/pci-xtalk-bridge.c
index 5958217861b8..9b3cc775c55e 100644
--- a/arch/mips/pci/pci-xtalk-bridge.c
+++ b/arch/mips/pci/pci-xtalk-bridge.c
@@ -728,6 +728,7 @@ err_free_resource:
 	pci_free_resource_list(&host->windows);
 err_remove_domain:
 	irq_domain_remove(domain);
+	irq_domain_free_fwnode(fn);
 	return err;
 }
 
@@ -735,8 +736,10 @@ static int bridge_remove(struct platform_device *pdev)
 {
 	struct pci_bus *bus = platform_get_drvdata(pdev);
 	struct bridge_controller *bc = BRIDGE_CONTROLLER(bus);
+	struct fwnode_handle *fn = bc->domain->fwnode;
 
 	irq_domain_remove(bc->domain);
+	irq_domain_free_fwnode(fn);
 	pci_lock_rescan_remove();
 	pci_stop_root_bus(bus);
 	pci_remove_root_bus(bus);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 81ffcfbfaef2..21325a4a78b9 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2335,8 +2335,13 @@ static int mp_irqdomain_create(int ioapic)
 
 static void ioapic_destroy_irqdomain(int idx)
 {
+	struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg;
+	struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode;
+
 	if (ioapics[idx].irqdomain) {
 		irq_domain_remove(ioapics[idx].irqdomain);
+		if (!cfg->dev)
+			irq_domain_free_fwnode(fn);
 		ioapics[idx].irqdomain = NULL;
 	}
 }
diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
index 9564d23d094f..aa096b333a99 100644
--- a/drivers/iommu/intel/irq_remapping.c
+++ b/drivers/iommu/intel/irq_remapping.c
@@ -628,13 +628,21 @@ out_free_table:
 
 static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
 {
+	struct fwnode_handle *fn;
+
 	if (iommu && iommu->ir_table) {
 		if (iommu->ir_msi_domain) {
+			fn = iommu->ir_msi_domain->fwnode;
+
 			irq_domain_remove(iommu->ir_msi_domain);
+			irq_domain_free_fwnode(fn);
 			iommu->ir_msi_domain = NULL;
 		}
 		if (iommu->ir_domain) {
+			fn = iommu->ir_domain->fwnode;
+
 			irq_domain_remove(iommu->ir_domain);
+			irq_domain_free_fwnode(fn);
 			iommu->ir_domain = NULL;
 		}
 		free_pages((unsigned long)iommu->ir_table->base,
diff --git a/drivers/mfd/ioc3.c b/drivers/mfd/ioc3.c
index 74cee7cb0afc..d939ccc46509 100644
--- a/drivers/mfd/ioc3.c
+++ b/drivers/mfd/ioc3.c
@@ -616,7 +616,10 @@ static int ioc3_mfd_probe(struct pci_dev *pdev,
 		/* Remove all already added MFD devices */
 		mfd_remove_devices(&ipd->pdev->dev);
 		if (ipd->domain) {
+			struct fwnode_handle *fn = ipd->domain->fwnode;
+
 			irq_domain_remove(ipd->domain);
+			irq_domain_free_fwnode(fn);
 			free_irq(ipd->domain_irq, (void *)ipd);
 		}
 		pci_iounmap(pdev, regs);
@@ -643,7 +646,10 @@ static void ioc3_mfd_remove(struct pci_dev *pdev)
 	/* Release resources */
 	mfd_remove_devices(&ipd->pdev->dev);
 	if (ipd->domain) {
+		struct fwnode_handle *fn = ipd->domain->fwnode;
+
 		irq_domain_remove(ipd->domain);
+		irq_domain_free_fwnode(fn);
 		free_irq(ipd->domain_irq, (void *)ipd);
 	}
 	pci_iounmap(pdev, ipd->regs);
diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 9a64cf90c291..ebec0a6e77ed 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -560,6 +560,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd, unsigned long features)
 	if (!vmd->bus) {
 		pci_free_resource_list(&resources);
 		irq_domain_remove(vmd->irq_domain);
+		irq_domain_free_fwnode(fn);
 		return -ENODEV;
 	}
 
@@ -673,6 +674,7 @@ static void vmd_cleanup_srcu(struct vmd_dev *vmd)
 static void vmd_remove(struct pci_dev *dev)
 {
 	struct vmd_dev *vmd = pci_get_drvdata(dev);
+	struct fwnode_handle *fn = vmd->irq_domain->fwnode;
 
 	sysfs_remove_link(&vmd->dev->dev.kobj, "domain");
 	pci_stop_root_bus(vmd->bus);
@@ -680,6 +682,7 @@ static void vmd_remove(struct pci_dev *dev)
 	vmd_cleanup_srcu(vmd);
 	vmd_detach_resources(vmd);
 	irq_domain_remove(vmd->irq_domain);
+	irq_domain_free_fwnode(fn);
 }
 
 #ifdef CONFIG_PM_SLEEP

From 0ae3b13aab210e2a8c14371731abddfee228ae24 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Tue, 21 Jul 2020 10:33:15 +0200
Subject: [PATCH 398/502] arm64/entry: deduplicate SW PAN entry/exit routines

Factor the 12 copies of the SW PAN entry and exit code into callable
subroutines, and use alternatives patching to either emit a 'bl'
instruction to call them, or a NOP if h/w PAN is found to be available
at runtime.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20200721083315.4816-1-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/entry.S | 95 +++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 48 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 5304d193c79d..7b9a7c45ef85 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -209,28 +209,9 @@ alternative_cb_end
 	add	x29, sp, #S_STACKFRAME
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
-	/*
-	 * Set the TTBR0 PAN bit in SPSR. When the exception is taken from
-	 * EL0, there is no need to check the state of TTBR0_EL1 since
-	 * accesses are always enabled.
-	 * Note that the meaning of this bit differs from the ARMv8.1 PAN
-	 * feature as all TTBR0_EL1 accesses are disabled, not just those to
-	 * user mappings.
-	 */
-alternative_if ARM64_HAS_PAN
-	b	1f				// skip TTBR0 PAN
+alternative_if_not ARM64_HAS_PAN
+	bl	__swpan_entry_el\el
 alternative_else_nop_endif
-
-	.if	\el != 0
-	mrs	x21, ttbr0_el1
-	tst	x21, #TTBR_ASID_MASK		// Check for the reserved ASID
-	orr	x23, x23, #PSR_PAN_BIT		// Set the emulated PAN in the saved SPSR
-	b.eq	1f				// TTBR0 access already disabled
-	and	x23, x23, #~PSR_PAN_BIT		// Clear the emulated PAN in the saved SPSR
-	.endif
-
-	__uaccess_ttbr0_disable x21
-1:
 #endif
 
 	stp	x22, x23, [sp, #S_PC]
@@ -284,34 +265,9 @@ alternative_else_nop_endif
 	.endif
 
 #ifdef CONFIG_ARM64_SW_TTBR0_PAN
-	/*
-	 * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
-	 * PAN bit checking.
-	 */
-alternative_if ARM64_HAS_PAN
-	b	2f				// skip TTBR0 PAN
+alternative_if_not ARM64_HAS_PAN
+	bl	__swpan_exit_el\el
 alternative_else_nop_endif
-
-	.if	\el != 0
-	tbnz	x22, #22, 1f			// Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
-	.endif
-
-	__uaccess_ttbr0_enable x0, x1
-
-	.if	\el == 0
-	/*
-	 * Enable errata workarounds only if returning to user. The only
-	 * workaround currently required for TTBR0_EL1 changes are for the
-	 * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
-	 * corruption).
-	 */
-	bl	post_ttbr_update_workaround
-	.endif
-1:
-	.if	\el != 0
-	and	x22, x22, #~PSR_PAN_BIT		// ARMv8.0 CPUs do not understand this bit
-	.endif
-2:
 #endif
 
 	.if	\el == 0
@@ -391,6 +347,49 @@ alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0
 	sb
 	.endm
 
+#ifdef CONFIG_ARM64_SW_TTBR0_PAN
+	/*
+	 * Set the TTBR0 PAN bit in SPSR. When the exception is taken from
+	 * EL0, there is no need to check the state of TTBR0_EL1 since
+	 * accesses are always enabled.
+	 * Note that the meaning of this bit differs from the ARMv8.1 PAN
+	 * feature as all TTBR0_EL1 accesses are disabled, not just those to
+	 * user mappings.
+	 */
+SYM_CODE_START_LOCAL(__swpan_entry_el1)
+	mrs	x21, ttbr0_el1
+	tst	x21, #TTBR_ASID_MASK		// Check for the reserved ASID
+	orr	x23, x23, #PSR_PAN_BIT		// Set the emulated PAN in the saved SPSR
+	b.eq	1f				// TTBR0 access already disabled
+	and	x23, x23, #~PSR_PAN_BIT		// Clear the emulated PAN in the saved SPSR
+SYM_INNER_LABEL(__swpan_entry_el0, SYM_L_LOCAL)
+	__uaccess_ttbr0_disable x21
+1:	ret
+SYM_CODE_END(__swpan_entry_el1)
+
+	/*
+	 * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR
+	 * PAN bit checking.
+	 */
+SYM_CODE_START_LOCAL(__swpan_exit_el1)
+	tbnz	x22, #22, 1f			// Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set
+	__uaccess_ttbr0_enable x0, x1
+1:	and	x22, x22, #~PSR_PAN_BIT		// ARMv8.0 CPUs do not understand this bit
+	ret
+SYM_CODE_END(__swpan_exit_el1)
+
+SYM_CODE_START_LOCAL(__swpan_exit_el0)
+	__uaccess_ttbr0_enable x0, x1
+	/*
+	 * Enable errata workarounds only if returning to user. The only
+	 * workaround currently required for TTBR0_EL1 changes are for the
+	 * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache
+	 * corruption).
+	 */
+	b	post_ttbr_update_workaround
+SYM_CODE_END(__swpan_exit_el0)
+#endif
+
 	.macro	irq_stack_entry
 	mov	x19, sp			// preserve the original sp
 #ifdef CONFIG_SHADOW_CALL_STACK

From a46cec12f4a53ee5113f42b327cbb8d4cda074d2 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Fri, 24 Jul 2020 10:41:31 +0100
Subject: [PATCH 399/502] arm64: Reserve HWCAP2_MTE as (1 << 18)

While MTE is not supported in the upstream kernel yet, add a comment
that HWCAP2_MTE as (1 << 18) is reserved. Glibc makes use of it for the
resolving (ifunc) of the MTE-safe string routines.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/hwcap.h      | 1 +
 arch/arm64/include/uapi/asm/hwcap.h | 1 +
 arch/arm64/kernel/cpuinfo.c         | 1 +
 3 files changed, 3 insertions(+)

diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index d683bcbf1e7c..22f73fe09030 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -95,6 +95,7 @@
 #define KERNEL_HWCAP_DGH		__khwcap2_feature(DGH)
 #define KERNEL_HWCAP_RNG		__khwcap2_feature(RNG)
 #define KERNEL_HWCAP_BTI		__khwcap2_feature(BTI)
+/* reserved for KERNEL_HWCAP_MTE	__khwcap2_feature(MTE) */
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 2d6ba1c2592e..912162f73529 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -74,5 +74,6 @@
 #define HWCAP2_DGH		(1 << 15)
 #define HWCAP2_RNG		(1 << 16)
 #define HWCAP2_BTI		(1 << 17)
+/* reserved for HWCAP2_MTE	(1 << 18) */
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 86637466daa8..393c6fb1f1cb 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -93,6 +93,7 @@ static const char *const hwcap_str[] = {
 	"dgh",
 	"rng",
 	"bti",
+	/* reserved for "mte" */
 	NULL
 };
 

From ea0eada45632f4807b2f49de951072283e2d781c Mon Sep 17 00:00:00 2001
From: Gregory Herrero <gregory.herrero@oracle.com>
Date: Fri, 17 Jul 2020 16:33:38 +0200
Subject: [PATCH 400/502] recordmcount: only record relocation of type
 R_AARCH64_CALL26 on arm64.

Currently, if a section has a relocation to '_mcount' symbol, a new
__mcount_loc entry will be added whatever the relocation type is.
This is problematic when a relocation to '_mcount' is in the middle of a
section and is not a call for ftrace use.

Such a relocation could be generated by the code below, for example:
    bool is_mcount(unsigned long addr)
    {
        return (addr == (unsigned long) &_mcount);
    }

With this snippet of code, ftrace will try to patch the mcount location
generated by this code on module load and fail with:

    Call trace:
     ftrace_bug+0xa0/0x28c
     ftrace_process_locs+0x2f4/0x430
     ftrace_module_init+0x30/0x38
     load_module+0x14f0/0x1e78
     __do_sys_finit_module+0x100/0x11c
     __arm64_sys_finit_module+0x28/0x34
     el0_svc_common+0x88/0x194
     el0_svc_handler+0x38/0x8c
     el0_svc+0x8/0xc
    ---[ end trace d828d06b36ad9d59 ]---
    ftrace failed to modify
    [<ffffa2dbf3a3a41c>] 0xffffa2dbf3a3a41c
     actual:   66:a9:3c:90
    Initializing ftrace call sites
    ftrace record flags: 2000000
     (0)
    expected tramp: ffffa2dc6cf66724

So limit the relocation type to R_AARCH64_CALL26, as in the Perl version of
recordmcount.

Fixes: af64d2aa872a ("ftrace: Add arm64 support to recordmcount")
Signed-off-by: Gregory Herrero <gregory.herrero@oracle.com>
Acked-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
Link: https://lore.kernel.org/r/20200717143338.19302-1-gregory.herrero@oracle.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 scripts/recordmcount.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c
index 7225107a9aaf..e59022b3f125 100644
--- a/scripts/recordmcount.c
+++ b/scripts/recordmcount.c
@@ -434,6 +434,11 @@ static int arm_is_fake_mcount(Elf32_Rel const *rp)
 	return 1;
 }
 
+static int arm64_is_fake_mcount(Elf64_Rel const *rp)
+{
+	return ELF64_R_TYPE(w(rp->r_info)) != R_AARCH64_CALL26;
+}
+
 /* 64-bit EM_MIPS has weird ELF64_Rela.r_info.
  * http://techpubs.sgi.com/library/manuals/4000/007-4658-001/pdf/007-4658-001.pdf
  * We interpret Table 29 Relocation Operation (Elf64_Rel, Elf64_Rela) [p.40]
@@ -547,6 +552,7 @@ static int do_file(char const *const fname)
 		make_nop = make_nop_arm64;
 		rel_type_nop = R_AARCH64_NONE;
 		ideal_nop = ideal_nop4_arm64;
+		is_fake_mcount64 = arm64_is_fake_mcount;
 		break;
 	case EM_IA_64:	reltype = R_IA64_IMM64; break;
 	case EM_MIPS:	/* reltype: e_class    */ break;

From d19e789f068b3d633cbac430764962f404198022 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 24 Jul 2020 13:50:25 +0200
Subject: [PATCH 401/502] compiler.h: Move instrumentation_begin()/end() to new
 <linux/instrumentation.h> header

Linus pointed out that compiler.h - which is a key header that gets included in every
single one of the 28,000+ kernel files during a kernel build - was bloated in:

  655389666643: ("vmlinux.lds.h: Create section for protection against instrumentation")

Linus noted:

 > I have pulled this, but do we really want to add this to a header file
 > that is _so_ core that it gets included for basically every single
 > file built?
 >
 > I don't even see those instrumentation_begin/end() things used
 > anywhere right now.
 >
 > It seems excessive. That 53 lines is maybe not a lot, but it pushed
 > that header file to over 12kB, and while it's mostly comments, it's
 > extra IO and parsing basically for _every_ single file compiled in the
 > kernel.
 >
 > For what appears to be absolutely zero upside right now, and I really
 > don't see why this should be in such a core header file!

Move these primitives into a new header: <linux/instrumentation.h>, and include that
header in the headers that make use of it.

Unfortunately one of these headers is asm-generic/bug.h, which does get included
in a lot of places, similarly to compiler.h. So the de-bloating effect isn't as
good as we'd like it to be - but at least the interfaces are defined separately.

No change to functionality intended.

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200604071921.GA1361070@gmail.com
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 arch/x86/include/asm/bug.h       |  1 +
 include/asm-generic/bug.h        |  1 +
 include/linux/compiler.h         | 53 -----------------------------
 include/linux/context_tracking.h |  2 ++
 include/linux/instrumentation.h  | 57 ++++++++++++++++++++++++++++++++
 5 files changed, 61 insertions(+), 53 deletions(-)
 create mode 100644 include/linux/instrumentation.h

diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index 028189575560..297fa12e7e27 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -3,6 +3,7 @@
 #define _ASM_X86_BUG_H
 
 #include <linux/stringify.h>
+#include <linux/instrumentation.h>
 
 /*
  * Despite that some emulators terminate on UD2, we use it for WARN().
diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h
index c94e33ae3e7b..18b0f4eee8cb 100644
--- a/include/asm-generic/bug.h
+++ b/include/asm-generic/bug.h
@@ -3,6 +3,7 @@
 #define _ASM_GENERIC_BUG_H
 
 #include <linux/compiler.h>
+#include <linux/instrumentation.h>
 
 #define CUT_HERE		"------------[ cut here ]------------\n"
 
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index 204e76856435..681894bfde99 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -120,65 +120,12 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
 /* Annotate a C jump table to allow objtool to follow the code flow */
 #define __annotate_jump_table __section(.rodata..c_jump_table)
 
-#ifdef CONFIG_DEBUG_ENTRY
-/* Begin/end of an instrumentation safe region */
-#define instrumentation_begin() ({					\
-	asm volatile("%c0: nop\n\t"						\
-		     ".pushsection .discard.instr_begin\n\t"		\
-		     ".long %c0b - .\n\t"				\
-		     ".popsection\n\t" : : "i" (__COUNTER__));		\
-})
-
-/*
- * Because instrumentation_{begin,end}() can nest, objtool validation considers
- * _begin() a +1 and _end() a -1 and computes a sum over the instructions.
- * When the value is greater than 0, we consider instrumentation allowed.
- *
- * There is a problem with code like:
- *
- * noinstr void foo()
- * {
- *	instrumentation_begin();
- *	...
- *	if (cond) {
- *		instrumentation_begin();
- *		...
- *		instrumentation_end();
- *	}
- *	bar();
- *	instrumentation_end();
- * }
- *
- * If instrumentation_end() would be an empty label, like all the other
- * annotations, the inner _end(), which is at the end of a conditional block,
- * would land on the instruction after the block.
- *
- * If we then consider the sum of the !cond path, we'll see that the call to
- * bar() is with a 0-value, even though, we meant it to happen with a positive
- * value.
- *
- * To avoid this, have _end() be a NOP instruction, this ensures it will be
- * part of the condition block and does not escape.
- */
-#define instrumentation_end() ({					\
-	asm volatile("%c0: nop\n\t"					\
-		     ".pushsection .discard.instr_end\n\t"		\
-		     ".long %c0b - .\n\t"				\
-		     ".popsection\n\t" : : "i" (__COUNTER__));		\
-})
-#endif /* CONFIG_DEBUG_ENTRY */
-
 #else
 #define annotate_reachable()
 #define annotate_unreachable()
 #define __annotate_jump_table
 #endif
 
-#ifndef instrumentation_begin
-#define instrumentation_begin()		do { } while(0)
-#define instrumentation_end()		do { } while(0)
-#endif
-
 #ifndef ASM_UNREACHABLE
 # define ASM_UNREACHABLE
 #endif
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 981b880d5b60..d53cd331c4dd 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -5,6 +5,8 @@
 #include <linux/sched.h>
 #include <linux/vtime.h>
 #include <linux/context_tracking_state.h>
+#include <linux/instrumentation.h>
+
 #include <asm/ptrace.h>
 
 
diff --git a/include/linux/instrumentation.h b/include/linux/instrumentation.h
new file mode 100644
index 000000000000..93e2ad67fc10
--- /dev/null
+++ b/include/linux/instrumentation.h
@@ -0,0 +1,57 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __LINUX_INSTRUMENTATION_H
+#define __LINUX_INSTRUMENTATION_H
+
+#if defined(CONFIG_DEBUG_ENTRY) && defined(CONFIG_STACK_VALIDATION)
+
+/* Begin/end of an instrumentation safe region */
+#define instrumentation_begin() ({					\
+	asm volatile("%c0: nop\n\t"						\
+		     ".pushsection .discard.instr_begin\n\t"		\
+		     ".long %c0b - .\n\t"				\
+		     ".popsection\n\t" : : "i" (__COUNTER__));		\
+})
+
+/*
+ * Because instrumentation_{begin,end}() can nest, objtool validation considers
+ * _begin() a +1 and _end() a -1 and computes a sum over the instructions.
+ * When the value is greater than 0, we consider instrumentation allowed.
+ *
+ * There is a problem with code like:
+ *
+ * noinstr void foo()
+ * {
+ *	instrumentation_begin();
+ *	...
+ *	if (cond) {
+ *		instrumentation_begin();
+ *		...
+ *		instrumentation_end();
+ *	}
+ *	bar();
+ *	instrumentation_end();
+ * }
+ *
+ * If instrumentation_end() would be an empty label, like all the other
+ * annotations, the inner _end(), which is at the end of a conditional block,
+ * would land on the instruction after the block.
+ *
+ * If we then consider the sum of the !cond path, we'll see that the call to
+ * bar() is with a 0-value, even though, we meant it to happen with a positive
+ * value.
+ *
+ * To avoid this, have _end() be a NOP instruction, this ensures it will be
+ * part of the condition block and does not escape.
+ */
+#define instrumentation_end() ({					\
+	asm volatile("%c0: nop\n\t"					\
+		     ".pushsection .discard.instr_end\n\t"		\
+		     ".long %c0b - .\n\t"				\
+		     ".popsection\n\t" : : "i" (__COUNTER__));		\
+})
+#else
+# define instrumentation_begin()	do { } while(0)
+# define instrumentation_end()		do { } while(0)
+#endif
+
+#endif /* __LINUX_INSTRUMENTATION_H */

From d53b5c013e1e7c3b43a7487171a84ee2acdd9597 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Wed, 24 Jun 2020 01:33:16 -0700
Subject: [PATCH 402/502] arm64/vdso: use the fault callback to map vvar pages

Currently the vdso has no awareness of time namespaces, which may
apply distinct offsets to processes in different namespaces. To handle
this within the vdso, we'll need to expose a per-namespace data page.

As a preparatory step, this patch separates the vdso data page from
the code pages, and has it faulted in via its own fault callback.
Subsequent patches will extend this to support distinct pages per time
namespace.

The vvar vma has to be installed with the VM_PFNMAP flag to handle
faults via its vma fault callback.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Dmitry Safonov <dima@arista.com>
Link: https://lore.kernel.org/r/20200624083321.144975-2-avagin@gmail.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/vdso.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index e546df0efefb..eb7798e5eb00 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -107,29 +107,32 @@ static int __vdso_init(enum vdso_abi abi)
 			vdso_info[abi].vdso_code_start) >>
 			PAGE_SHIFT;
 
-	/* Allocate the vDSO pagelist, plus a page for the data. */
-	vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages + 1,
+	vdso_pagelist = kcalloc(vdso_info[abi].vdso_pages,
 				sizeof(struct page *),
 				GFP_KERNEL);
 	if (vdso_pagelist == NULL)
 		return -ENOMEM;
 
-	/* Grab the vDSO data page. */
-	vdso_pagelist[0] = phys_to_page(__pa_symbol(vdso_data));
-
-
 	/* Grab the vDSO code pages. */
 	pfn = sym_to_pfn(vdso_info[abi].vdso_code_start);
 
 	for (i = 0; i < vdso_info[abi].vdso_pages; i++)
-		vdso_pagelist[i + 1] = pfn_to_page(pfn + i);
+		vdso_pagelist[i] = pfn_to_page(pfn + i);
 
-	vdso_info[abi].dm->pages = &vdso_pagelist[0];
-	vdso_info[abi].cm->pages = &vdso_pagelist[1];
+	vdso_info[abi].cm->pages = vdso_pagelist;
 
 	return 0;
 }
 
+static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
+			     struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	if (vmf->pgoff == 0)
+		return vmf_insert_pfn(vma, vmf->address,
+				sym_to_pfn(vdso_data));
+	return VM_FAULT_SIGBUS;
+}
+
 static int __setup_additional_pages(enum vdso_abi abi,
 				    struct mm_struct *mm,
 				    struct linux_binprm *bprm,
@@ -150,7 +153,7 @@ static int __setup_additional_pages(enum vdso_abi abi,
 	}
 
 	ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE,
-				       VM_READ|VM_MAYREAD,
+				       VM_READ|VM_MAYREAD|VM_PFNMAP,
 				       vdso_info[abi].dm);
 	if (IS_ERR(ret))
 		goto up_fail;
@@ -206,6 +209,7 @@ static struct vm_special_mapping aarch32_vdso_maps[] = {
 #ifdef CONFIG_COMPAT_VDSO
 	[AA32_MAP_VVAR] = {
 		.name = "[vvar]",
+		.fault = vvar_fault,
 	},
 	[AA32_MAP_VDSO] = {
 		.name = "[vdso]",
@@ -371,6 +375,7 @@ enum aarch64_map {
 static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = {
 	[AA64_MAP_VVAR] = {
 		.name	= "[vvar]",
+		.fault = vvar_fault,
 	},
 	[AA64_MAP_VDSO] = {
 		.name	= "[vdso]",

From 1b6867d2916bb91e94ddcc9c709e4779419fe391 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Wed, 24 Jun 2020 01:33:17 -0700
Subject: [PATCH 403/502] arm64/vdso: Zap vvar pages when switching to a time
 namespace

The order of vvar pages depends on whether a task belongs to the root
time namespace or not. In the root time namespace, a task doesn't have a
per-namespace page. In a non-root namespace, the VVAR page which contains
the system-wide VDSO data is replaced with a namespace specific page
that contains clock offsets.

Whenever a task changes its namespace, the VVAR page tables are cleared
and then they will be re-faulted with a corresponding layout.

A task can switch its time namespace only if its ->mm isn't shared with
another task.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Dmitry Safonov <dima@arista.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Link: https://lore.kernel.org/r/20200624083321.144975-3-avagin@gmail.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/vdso.c | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index eb7798e5eb00..33ac18060bfc 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -124,6 +124,37 @@ static int __vdso_init(enum vdso_abi abi)
 	return 0;
 }
 
+#ifdef CONFIG_TIME_NS
+/*
+ * The vvar mapping contains data for a specific time namespace, so when a task
+ * changes namespace we must unmap its vvar data for the old namespace.
+ * Subsequent faults will map in data for the new namespace.
+ *
+ * For more details see timens_setup_vdso_data().
+ */
+int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
+{
+	struct mm_struct *mm = task->mm;
+	struct vm_area_struct *vma;
+
+	mmap_read_lock(mm);
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		unsigned long size = vma->vm_end - vma->vm_start;
+
+		if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm))
+			zap_page_range(vma, vma->vm_start, size);
+#ifdef CONFIG_COMPAT_VDSO
+		if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA32].dm))
+			zap_page_range(vma, vma->vm_start, size);
+#endif
+	}
+
+	mmap_read_unlock(mm);
+	return 0;
+}
+#endif
+
 static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
 			     struct vm_area_struct *vma, struct vm_fault *vmf)
 {

From 3503d56cc7233ced602e38a4c13caa64f00ab2aa Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Wed, 24 Jun 2020 01:33:18 -0700
Subject: [PATCH 404/502] arm64/vdso: Add time namespace page

Allocate the time namespace page among VVAR pages.  Provide
__arch_get_timens_vdso_data() helper for VDSO code to get the
code-relative position of VVARs on that special page.

If a task belongs to a time namespace then the VVAR page which contains
the system wide VDSO data is replaced with a namespace specific page
which has the same layout as the VVAR page. That page has vdso_data->seq
set to 1 to enforce the slow path and vdso_data->clock_mode set to
VCLOCK_TIMENS to enforce the time namespace handling path.

The extra check in the case that vdso_data->seq is odd, e.g. a concurrent
update of the VDSO data is in progress, is not really affecting regular
tasks which are not part of a time namespace as the task is spin waiting
for the update to finish and vdso_data->seq to become even again.

If a time namespace task hits that code path, it invokes the corresponding
time getter function which retrieves the real VVAR page, reads host time
and then adds the offset for the requested clock which is stored in the
special VVAR page.

The time-namespace page isn't allocated on !CONFIG_TIME_NS, but
vma is the same size, which simplifies criu/vdso migration between
different kernel configs.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Dmitry Safonov <dima@arista.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20200624083321.144975-4-avagin@gmail.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/vdso.h                 |  2 ++
 .../include/asm/vdso/compat_gettimeofday.h    | 12 ++++++++++++
 arch/arm64/include/asm/vdso/gettimeofday.h    |  8 ++++++++
 arch/arm64/kernel/vdso.c                      | 19 ++++++++++++++++---
 arch/arm64/kernel/vdso/vdso.lds.S             |  5 ++++-
 arch/arm64/kernel/vdso32/vdso.lds.S           |  5 ++++-
 include/vdso/datapage.h                       |  1 +
 7 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/vdso.h b/arch/arm64/include/asm/vdso.h
index 07468428fd29..f99dcb94b438 100644
--- a/arch/arm64/include/asm/vdso.h
+++ b/arch/arm64/include/asm/vdso.h
@@ -12,6 +12,8 @@
  */
 #define VDSO_LBASE	0x0
 
+#define __VVAR_PAGES    2
+
 #ifndef __ASSEMBLY__
 
 #include <generated/vdso-offsets.h>
diff --git a/arch/arm64/include/asm/vdso/compat_gettimeofday.h b/arch/arm64/include/asm/vdso/compat_gettimeofday.h
index b6907ae78e53..b7c549d46d18 100644
--- a/arch/arm64/include/asm/vdso/compat_gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/compat_gettimeofday.h
@@ -152,6 +152,18 @@ static __always_inline const struct vdso_data *__arch_get_vdso_data(void)
 	return ret;
 }
 
+#ifdef CONFIG_TIME_NS
+static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void)
+{
+	const struct vdso_data *ret;
+
+	/* See __arch_get_vdso_data(). */
+	asm volatile("mov %0, %1" : "=r"(ret) : "r"(_timens_data));
+
+	return ret;
+}
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/arm64/include/asm/vdso/gettimeofday.h b/arch/arm64/include/asm/vdso/gettimeofday.h
index afba6ba332f8..cf39eae5eaaf 100644
--- a/arch/arm64/include/asm/vdso/gettimeofday.h
+++ b/arch/arm64/include/asm/vdso/gettimeofday.h
@@ -96,6 +96,14 @@ const struct vdso_data *__arch_get_vdso_data(void)
 	return _vdso_data;
 }
 
+#ifdef CONFIG_TIME_NS
+static __always_inline
+const struct vdso_data *__arch_get_timens_vdso_data(void)
+{
+	return _timens_data;
+}
+#endif
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_VDSO_GETTIMEOFDAY_H */
diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 33ac18060bfc..fcb559726920 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -40,6 +40,12 @@ enum vdso_abi {
 #endif /* CONFIG_COMPAT_VDSO */
 };
 
+enum vvar_pages {
+	VVAR_DATA_PAGE_OFFSET,
+	VVAR_TIMENS_PAGE_OFFSET,
+	VVAR_NR_PAGES,
+};
+
 struct vdso_abi_info {
 	const char *name;
 	const char *vdso_code_start;
@@ -125,6 +131,11 @@ static int __vdso_init(enum vdso_abi abi)
 }
 
 #ifdef CONFIG_TIME_NS
+struct vdso_data *arch_get_vdso_data(void *vvar_page)
+{
+	return (struct vdso_data *)(vvar_page);
+}
+
 /*
  * The vvar mapping contains data for a specific time namespace, so when a task
  * changes namespace we must unmap its vvar data for the old namespace.
@@ -173,9 +184,11 @@ static int __setup_additional_pages(enum vdso_abi abi,
 	unsigned long gp_flags = 0;
 	void *ret;
 
+	BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES);
+
 	vdso_text_len = vdso_info[abi].vdso_pages << PAGE_SHIFT;
 	/* Be sure to map the data page */
-	vdso_mapping_len = vdso_text_len + PAGE_SIZE;
+	vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE;
 
 	vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0);
 	if (IS_ERR_VALUE(vdso_base)) {
@@ -183,7 +196,7 @@ static int __setup_additional_pages(enum vdso_abi abi,
 		goto up_fail;
 	}
 
-	ret = _install_special_mapping(mm, vdso_base, PAGE_SIZE,
+	ret = _install_special_mapping(mm, vdso_base, VVAR_NR_PAGES * PAGE_SIZE,
 				       VM_READ|VM_MAYREAD|VM_PFNMAP,
 				       vdso_info[abi].dm);
 	if (IS_ERR(ret))
@@ -192,7 +205,7 @@ static int __setup_additional_pages(enum vdso_abi abi,
 	if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti())
 		gp_flags = VM_ARM64_BTI;
 
-	vdso_base += PAGE_SIZE;
+	vdso_base += VVAR_NR_PAGES * PAGE_SIZE;
 	mm->context.vdso = (void *)vdso_base;
 	ret = _install_special_mapping(mm, vdso_base, vdso_text_len,
 				       VM_READ|VM_EXEC|gp_flags|
diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S
index 7ad2d3a0cd48..d808ad31e01f 100644
--- a/arch/arm64/kernel/vdso/vdso.lds.S
+++ b/arch/arm64/kernel/vdso/vdso.lds.S
@@ -17,7 +17,10 @@ OUTPUT_ARCH(aarch64)
 
 SECTIONS
 {
-	PROVIDE(_vdso_data = . - PAGE_SIZE);
+	PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+	PROVIDE(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
 	. = VDSO_LBASE + SIZEOF_HEADERS;
 
 	.hash		: { *(.hash) }			:text
diff --git a/arch/arm64/kernel/vdso32/vdso.lds.S b/arch/arm64/kernel/vdso32/vdso.lds.S
index 337d03522048..3348ce5ea306 100644
--- a/arch/arm64/kernel/vdso32/vdso.lds.S
+++ b/arch/arm64/kernel/vdso32/vdso.lds.S
@@ -17,7 +17,10 @@ OUTPUT_ARCH(arm)
 
 SECTIONS
 {
-	PROVIDE_HIDDEN(_vdso_data = . - PAGE_SIZE);
+	PROVIDE_HIDDEN(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE);
+#ifdef CONFIG_TIME_NS
+	PROVIDE_HIDDEN(_timens_data = _vdso_data + PAGE_SIZE);
+#endif
 	. = VDSO_LBASE + SIZEOF_HEADERS;
 
 	.hash		: { *(.hash) }			:text
diff --git a/include/vdso/datapage.h b/include/vdso/datapage.h
index 7955c56d6b3c..ee810cae4e1e 100644
--- a/include/vdso/datapage.h
+++ b/include/vdso/datapage.h
@@ -109,6 +109,7 @@ struct vdso_data {
  * relocation, and this is what we need.
  */
 extern struct vdso_data _vdso_data[CS_BASES] __attribute__((visibility("hidden")));
+extern struct vdso_data _timens_data[CS_BASES] __attribute__((visibility("hidden")));
 
 /*
  * The generic vDSO implementation requires that gettimeofday.h

From ee3cda8e46060b021087b6ef451e1cd9fa648af6 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Wed, 24 Jun 2020 01:33:19 -0700
Subject: [PATCH 405/502] arm64/vdso: Handle faults on timens page

If a task belongs to a time namespace then the VVAR page which contains
the system wide VDSO data is replaced with a namespace specific page
which has the same layout as the VVAR page.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Dmitry Safonov <dima@arista.com>
Link: https://lore.kernel.org/r/20200624083321.144975-5-avagin@gmail.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/vdso.c | 56 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index fcb559726920..c11ee18e3e79 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -18,6 +18,7 @@
 #include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/slab.h>
+#include <linux/time_namespace.h>
 #include <linux/timekeeper_internal.h>
 #include <linux/vmalloc.h>
 #include <vdso/datapage.h>
@@ -164,15 +165,62 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
 	mmap_read_unlock(mm);
 	return 0;
 }
+
+static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+	if (likely(vma->vm_mm == current->mm))
+		return current->nsproxy->time_ns->vvar_page;
+
+	/*
+	 * VM_PFNMAP | VM_IO protect .fault() handler from being called
+	 * through interfaces like /proc/$pid/mem or
+	 * process_vm_{readv,writev}() as long as there's no .access()
+	 * in special_mapping_vmops.
+	 * For more details check_vma_flags() and __access_remote_vm()
+	 */
+	WARN(1, "vvar_page accessed remotely");
+
+	return NULL;
+}
+#else
+static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
+{
+	return NULL;
+}
 #endif
 
 static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
 			     struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	if (vmf->pgoff == 0)
-		return vmf_insert_pfn(vma, vmf->address,
-				sym_to_pfn(vdso_data));
-	return VM_FAULT_SIGBUS;
+	struct page *timens_page = find_timens_vvar_page(vma);
+	unsigned long pfn;
+
+	switch (vmf->pgoff) {
+	case VVAR_DATA_PAGE_OFFSET:
+		if (timens_page)
+			pfn = page_to_pfn(timens_page);
+		else
+			pfn = sym_to_pfn(vdso_data);
+		break;
+#ifdef CONFIG_TIME_NS
+	case VVAR_TIMENS_PAGE_OFFSET:
+		/*
+		 * If a task belongs to a time namespace then a namespace
+		 * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and
+		 * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET
+		 * offset.
+		 * See also the comment near timens_setup_vdso_data().
+		 */
+		if (!timens_page)
+			return VM_FAULT_SIGBUS;
+		pfn = sym_to_pfn(vdso_data);
+		break;
+#endif /* CONFIG_TIME_NS */
+	default:
+		return VM_FAULT_SIGBUS;
+	}
+
+	return vmf_insert_pfn(vma, vmf->address, pfn);
 }
 
 static int __setup_additional_pages(enum vdso_abi abi,

From bcf996434240c611f0fdab2c18cd75dd59cfa3c2 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Wed, 24 Jun 2020 01:33:20 -0700
Subject: [PATCH 406/502] arm64/vdso: Restrict splitting VVAR VMA

Forbid splitting VVAR VMA resulting in a stricter ABI and reducing the
amount of corner-cases to consider while working further on VDSO time
namespace support.

As the offset from timens to VVAR page is computed at compile time, the pages
in VVAR should stay together and not be partially mremap()'ed.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Dmitry Safonov <dima@arista.com>
Link: https://lore.kernel.org/r/20200624083321.144975-6-avagin@gmail.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/vdso.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index c11ee18e3e79..d4202a32abc9 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -223,6 +223,17 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
 	return vmf_insert_pfn(vma, vmf->address, pfn);
 }
 
+static int vvar_mremap(const struct vm_special_mapping *sm,
+		       struct vm_area_struct *new_vma)
+{
+	unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
+
+	if (new_size != VVAR_NR_PAGES * PAGE_SIZE)
+		return -EINVAL;
+
+	return 0;
+}
+
 static int __setup_additional_pages(enum vdso_abi abi,
 				    struct mm_struct *mm,
 				    struct linux_binprm *bprm,
@@ -302,6 +313,7 @@ static struct vm_special_mapping aarch32_vdso_maps[] = {
 	[AA32_MAP_VVAR] = {
 		.name = "[vvar]",
 		.fault = vvar_fault,
+		.mremap = vvar_mremap,
 	},
 	[AA32_MAP_VDSO] = {
 		.name = "[vdso]",
@@ -468,6 +480,7 @@ static struct vm_special_mapping aarch64_vdso_maps[] __ro_after_init = {
 	[AA64_MAP_VVAR] = {
 		.name	= "[vvar]",
 		.fault = vvar_fault,
+		.mremap = vvar_mremap,
 	},
 	[AA64_MAP_VDSO] = {
 		.name	= "[vdso]",

From 9614cc576d76a7449cd51b60ef81fd0ce19ee694 Mon Sep 17 00:00:00 2001
From: Andrei Vagin <avagin@gmail.com>
Date: Wed, 24 Jun 2020 01:33:21 -0700
Subject: [PATCH 407/502] arm64: enable time namespace support

CONFIG_TIME_NS depends on GENERIC_VDSO_TIME_NS.

Signed-off-by: Andrei Vagin <avagin@gmail.com>
Reviewed-by: Vincenzo Frascino <vincenzo.frascino@arm.com>
Reviewed-by: Dmitry Safonov <dima@arista.com>
Link: https://lore.kernel.org/r/20200624083321.144975-7-avagin@gmail.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 66dc41fd49f2..87255f02ec5b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -118,6 +118,7 @@ config ARM64
 	select GENERIC_STRNLEN_USER
 	select GENERIC_TIME_VSYSCALL
 	select GENERIC_GETTIMEOFDAY
+	select GENERIC_VDSO_TIME_NS
 	select HANDLE_DOMAIN_IRQ
 	select HARDIRQS_SW_RESEND
 	select HAVE_PCI

From b36200f543ff07a1cb346aa582349141df2c8068 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov@google.com>
Date: Sat, 11 Jul 2020 11:31:11 +0200
Subject: [PATCH 408/502] io_uring: fix sq array offset calculation

rings_size() sets sq_offset to the total size of the rings (the returned
value which is used for memory allocation). This is wrong: sq array should
be located within the rings, not after them. Set sq_offset to where it
should be.

Fixes: 75b28affdd6a ("io_uring: allocate the two rings together")
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Hristo Venev <hristo@venev.name>
Cc: io-uring@vger.kernel.org
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ff3851d40df4..ca932fb3c67d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7416,6 +7416,9 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 		return SIZE_MAX;
 #endif
 
+	if (sq_offset)
+		*sq_offset = off;
+
 	sq_array_size = array_size(sizeof(u32), sq_entries);
 	if (sq_array_size == SIZE_MAX)
 		return SIZE_MAX;
@@ -7423,9 +7426,6 @@ static unsigned long rings_size(unsigned sq_entries, unsigned cq_entries,
 	if (check_add_overflow(off, sq_array_size, &off))
 		return SIZE_MAX;
 
-	if (sq_offset)
-		*sq_offset = off;
-
 	return off;
 }
 

From 270a5940700bb6cf9abf36ea10cf1fa0d453aa7a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 12 Jul 2020 20:41:04 +0300
Subject: [PATCH 409/502] io_uring: rename sr->msg into umsg

Every second field in send/recv is called msg, make it a bit more
understandable by renaming ->msg, which is a user provided ptr,
to ->umsg.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ca932fb3c67d..f17f098c403a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -414,7 +414,7 @@ struct io_connect {
 struct io_sr_msg {
 	struct file			*file;
 	union {
-		struct user_msghdr __user *msg;
+		struct user_msghdr __user *umsg;
 		void __user		*buf;
 	};
 	int				msg_flags;
@@ -3903,7 +3903,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -EINVAL;
 
 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
-	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 
 #ifdef CONFIG_COMPAT
@@ -3919,7 +3919,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	io->msg.msg.msg_name = &io->msg.addr;
 	io->msg.iov = io->msg.fast_iov;
-	ret = sendmsg_copy_msghdr(&io->msg.msg, sr->msg, sr->msg_flags,
+	ret = sendmsg_copy_msghdr(&io->msg.msg, sr->umsg, sr->msg_flags,
 					&io->msg.iov);
 	if (!ret)
 		req->flags |= REQ_F_NEED_CLEANUP;
@@ -3952,7 +3952,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 			kmsg->msg.msg_name = &io.msg.addr;
 
 			io.msg.iov = io.msg.fast_iov;
-			ret = sendmsg_copy_msghdr(&io.msg.msg, sr->msg,
+			ret = sendmsg_copy_msghdr(&io.msg.msg, sr->umsg,
 					sr->msg_flags, &io.msg.iov);
 			if (ret)
 				return ret;
@@ -4030,8 +4030,8 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
 	size_t iov_len;
 	int ret;
 
-	ret = __copy_msghdr_from_user(&io->msg.msg, sr->msg, &io->msg.uaddr,
-					&uiov, &iov_len);
+	ret = __copy_msghdr_from_user(&io->msg.msg, sr->umsg,
+					&io->msg.uaddr, &uiov, &iov_len);
 	if (ret)
 		return ret;
 
@@ -4065,7 +4065,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 	compat_size_t len;
 	int ret;
 
-	msg_compat = (struct compat_msghdr __user *) sr->msg;
+	msg_compat = (struct compat_msghdr __user *) sr->umsg;
 	ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
 					&ptr, &len);
 	if (ret)
@@ -4142,7 +4142,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 		return -EINVAL;
 
 	sr->msg_flags = READ_ONCE(sqe->msg_flags);
-	sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
+	sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
 	sr->len = READ_ONCE(sqe->len);
 	sr->bgid = READ_ONCE(sqe->buf_group);
 
@@ -4207,7 +4207,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 		else if (force_nonblock)
 			flags |= MSG_DONTWAIT;
 
-		ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.msg,
+		ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
 						kmsg->uaddr, flags);
 		if (force_nonblock && ret == -EAGAIN) {
 			ret = io_setup_async_msg(req, kmsg);

From 1400e69705baf98d1c9cb73b592a3a68aab1d852 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 12 Jul 2020 20:41:05 +0300
Subject: [PATCH 410/502] io_uring: use more specific type in rcv/snd msg cp

send/recv msghdr initialisation works with struct io_async_msghdr, but
pulls the whole struct io_async_ctx for no reason. That complicates it
with composite accessing, e.g. io->msg.

Use and pass the most specific type, which is struct io_async_msghdr.
It is the largest field in union io_async_ctx and doesn't save stack
space, but looks clearer.
Most of the changes replace "io->msg." with "iomsg->".

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 63 +++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 32 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index f17f098c403a..8acbaddaebb7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3935,7 +3935,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 
 	sock = sock_from_file(req->file, &ret);
 	if (sock) {
-		struct io_async_ctx io;
+		struct io_async_msghdr iomsg;
 		unsigned flags;
 
 		if (req->io) {
@@ -3948,14 +3948,13 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 		} else {
 			struct io_sr_msg *sr = &req->sr_msg;
 
-			kmsg = &io.msg;
-			kmsg->msg.msg_name = &io.msg.addr;
-
-			io.msg.iov = io.msg.fast_iov;
-			ret = sendmsg_copy_msghdr(&io.msg.msg, sr->umsg,
-					sr->msg_flags, &io.msg.iov);
+			iomsg.msg.msg_name = &iomsg.addr;
+			iomsg.iov = iomsg.fast_iov;
+			ret = sendmsg_copy_msghdr(&iomsg.msg, sr->umsg,
+					sr->msg_flags, &iomsg.iov);
 			if (ret)
 				return ret;
+			kmsg = &iomsg;
 		}
 
 		flags = req->sr_msg.msg_flags;
@@ -4023,30 +4022,31 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
 	return 0;
 }
 
-static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+static int __io_recvmsg_copy_hdr(struct io_kiocb *req,
+				 struct io_async_msghdr *iomsg)
 {
 	struct io_sr_msg *sr = &req->sr_msg;
 	struct iovec __user *uiov;
 	size_t iov_len;
 	int ret;
 
-	ret = __copy_msghdr_from_user(&io->msg.msg, sr->umsg,
-					&io->msg.uaddr, &uiov, &iov_len);
+	ret = __copy_msghdr_from_user(&iomsg->msg, sr->umsg,
+					&iomsg->uaddr, &uiov, &iov_len);
 	if (ret)
 		return ret;
 
 	if (req->flags & REQ_F_BUFFER_SELECT) {
 		if (iov_len > 1)
 			return -EINVAL;
-		if (copy_from_user(io->msg.iov, uiov, sizeof(*uiov)))
+		if (copy_from_user(iomsg->iov, uiov, sizeof(*uiov)))
 			return -EFAULT;
-		sr->len = io->msg.iov[0].iov_len;
-		iov_iter_init(&io->msg.msg.msg_iter, READ, io->msg.iov, 1,
+		sr->len = iomsg->iov[0].iov_len;
+		iov_iter_init(&iomsg->msg.msg_iter, READ, iomsg->iov, 1,
 				sr->len);
-		io->msg.iov = NULL;
+		iomsg->iov = NULL;
 	} else {
 		ret = import_iovec(READ, uiov, iov_len, UIO_FASTIOV,
-					&io->msg.iov, &io->msg.msg.msg_iter);
+					&iomsg->iov, &iomsg->msg.msg_iter);
 		if (ret > 0)
 			ret = 0;
 	}
@@ -4056,7 +4056,7 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
 
 #ifdef CONFIG_COMPAT
 static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
-					struct io_async_ctx *io)
+					struct io_async_msghdr *iomsg)
 {
 	struct compat_msghdr __user *msg_compat;
 	struct io_sr_msg *sr = &req->sr_msg;
@@ -4066,7 +4066,7 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 	int ret;
 
 	msg_compat = (struct compat_msghdr __user *) sr->umsg;
-	ret = __get_compat_msghdr(&io->msg.msg, msg_compat, &io->msg.uaddr,
+	ret = __get_compat_msghdr(&iomsg->msg, msg_compat, &iomsg->uaddr,
 					&ptr, &len);
 	if (ret)
 		return ret;
@@ -4083,12 +4083,12 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 			return -EFAULT;
 		if (clen < 0)
 			return -EINVAL;
-		sr->len = io->msg.iov[0].iov_len;
-		io->msg.iov = NULL;
+		sr->len = iomsg->iov[0].iov_len;
+		iomsg->iov = NULL;
 	} else {
 		ret = compat_import_iovec(READ, uiov, len, UIO_FASTIOV,
-						&io->msg.iov,
-						&io->msg.msg.msg_iter);
+						&iomsg->iov,
+						&iomsg->msg.msg_iter);
 		if (ret < 0)
 			return ret;
 	}
@@ -4097,17 +4097,18 @@ static int __io_compat_recvmsg_copy_hdr(struct io_kiocb *req,
 }
 #endif
 
-static int io_recvmsg_copy_hdr(struct io_kiocb *req, struct io_async_ctx *io)
+static int io_recvmsg_copy_hdr(struct io_kiocb *req,
+			       struct io_async_msghdr *iomsg)
 {
-	io->msg.msg.msg_name = &io->msg.addr;
-	io->msg.iov = io->msg.fast_iov;
+	iomsg->msg.msg_name = &iomsg->addr;
+	iomsg->iov = iomsg->fast_iov;
 
 #ifdef CONFIG_COMPAT
 	if (req->ctx->compat)
-		return __io_compat_recvmsg_copy_hdr(req, io);
+		return __io_compat_recvmsg_copy_hdr(req, iomsg);
 #endif
 
-	return __io_recvmsg_copy_hdr(req, io);
+	return __io_recvmsg_copy_hdr(req, iomsg);
 }
 
 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
@@ -4157,7 +4158,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		return 0;
 
-	ret = io_recvmsg_copy_hdr(req, io);
+	ret = io_recvmsg_copy_hdr(req, &io->msg);
 	if (!ret)
 		req->flags |= REQ_F_NEED_CLEANUP;
 	return ret;
@@ -4173,7 +4174,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 	sock = sock_from_file(req->file, &ret);
 	if (sock) {
 		struct io_buffer *kbuf;
-		struct io_async_ctx io;
+		struct io_async_msghdr iomsg;
 		unsigned flags;
 
 		if (req->io) {
@@ -4184,12 +4185,10 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 				kmsg->iov = kmsg->fast_iov;
 			kmsg->msg.msg_iter.iov = kmsg->iov;
 		} else {
-			kmsg = &io.msg;
-			kmsg->msg.msg_name = &io.msg.addr;
-
-			ret = io_recvmsg_copy_hdr(req, &io);
+			ret = io_recvmsg_copy_hdr(req, &iomsg);
 			if (ret)
 				return ret;
+			kmsg = &iomsg;
 		}
 
 		kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);

From 2ae523ed07f14391d685651f671a7858fe8c368a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 12 Jul 2020 20:41:06 +0300
Subject: [PATCH 411/502] io_uring: extract io_sendmsg_copy_hdr()

Don't repeat send msg initialisation code, it's error prone.
Extract and use a helper function.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8acbaddaebb7..a198466544e7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3893,6 +3893,15 @@ static int io_setup_async_msg(struct io_kiocb *req,
 	return -EAGAIN;
 }
 
+static int io_sendmsg_copy_hdr(struct io_kiocb *req,
+			       struct io_async_msghdr *iomsg)
+{
+	iomsg->iov = iomsg->fast_iov;
+	iomsg->msg.msg_name = &iomsg->addr;
+	return sendmsg_copy_msghdr(&iomsg->msg, req->sr_msg.umsg,
+				   req->sr_msg.msg_flags, &iomsg->iov);
+}
+
 static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_sr_msg *sr = &req->sr_msg;
@@ -3917,10 +3926,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (req->flags & REQ_F_NEED_CLEANUP)
 		return 0;
 
-	io->msg.msg.msg_name = &io->msg.addr;
-	io->msg.iov = io->msg.fast_iov;
-	ret = sendmsg_copy_msghdr(&io->msg.msg, sr->umsg, sr->msg_flags,
-					&io->msg.iov);
+	ret = io_sendmsg_copy_hdr(req, &io->msg);
 	if (!ret)
 		req->flags |= REQ_F_NEED_CLEANUP;
 	return ret;
@@ -3946,12 +3952,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 				kmsg->iov = kmsg->fast_iov;
 			kmsg->msg.msg_iter.iov = kmsg->iov;
 		} else {
-			struct io_sr_msg *sr = &req->sr_msg;
-
-			iomsg.msg.msg_name = &iomsg.addr;
-			iomsg.iov = iomsg.fast_iov;
-			ret = sendmsg_copy_msghdr(&iomsg.msg, sr->umsg,
-					sr->msg_flags, &iomsg.iov);
+			ret = io_sendmsg_copy_hdr(req, &iomsg);
 			if (ret)
 				return ret;
 			kmsg = &iomsg;

From e73751225bae1e9b67e957afb273366fbb6ca136 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sun, 12 Jul 2020 20:42:04 +0300
Subject: [PATCH 412/502] io_uring: replace rw->task_work with req->task_work

io_kiocb::task_work was de-unionised, and is not planned to be shared
back, because it's too useful and commonly used. Hence, instead of
keeping a separate task_work in struct io_async_rw just reuse
req->task_work.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 31 ++++---------------------------
 1 file changed, 4 insertions(+), 27 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a198466544e7..ddff3abff363 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -505,7 +505,6 @@ struct io_async_rw {
 	ssize_t				nr_segs;
 	ssize_t				size;
 	struct wait_page_queue		wpq;
-	struct callback_head		task_work;
 };
 
 struct io_async_ctx {
@@ -2901,33 +2900,11 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
-static void io_async_buf_cancel(struct callback_head *cb)
-{
-	struct io_async_rw *rw;
-	struct io_kiocb *req;
-
-	rw = container_of(cb, struct io_async_rw, task_work);
-	req = rw->wpq.wait.private;
-	__io_req_task_cancel(req, -ECANCELED);
-}
-
-static void io_async_buf_retry(struct callback_head *cb)
-{
-	struct io_async_rw *rw;
-	struct io_kiocb *req;
-
-	rw = container_of(cb, struct io_async_rw, task_work);
-	req = rw->wpq.wait.private;
-
-	__io_req_task_submit(req);
-}
-
 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 			     int sync, void *arg)
 {
 	struct wait_page_queue *wpq;
 	struct io_kiocb *req = wait->private;
-	struct io_async_rw *rw = &req->io->rw;
 	struct wait_page_key *key = arg;
 	int ret;
 
@@ -2939,17 +2916,17 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
 
 	list_del_init(&wait->entry);
 
-	init_task_work(&rw->task_work, io_async_buf_retry);
+	init_task_work(&req->task_work, io_req_task_submit);
 	/* submit ref gets dropped, acquire a new one */
 	refcount_inc(&req->refs);
-	ret = io_req_task_work_add(req, &rw->task_work);
+	ret = io_req_task_work_add(req, &req->task_work);
 	if (unlikely(ret)) {
 		struct task_struct *tsk;
 
 		/* queue just for cancelation */
-		init_task_work(&rw->task_work, io_async_buf_cancel);
+		init_task_work(&req->task_work, io_req_task_cancel);
 		tsk = io_wq_get_task(req->ctx->io_wq);
-		task_work_add(tsk, &rw->task_work, 0);
+		task_work_add(tsk, &req->task_work, 0);
 		wake_up_process(tsk);
 	}
 	return 1;

From b64e3444d4e1c71fe148a4f4535395b1fdd73200 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 22:59:18 +0300
Subject: [PATCH 413/502] io_uring: simplify io_req_map_rw()

Don't deref req->io->rw every time, but put it in a local variable. This
looks prettier, generates fewer instructions, and doesn't break alias
analysis.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ddff3abff363..3b8465dd0214 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2828,15 +2828,17 @@ static void io_req_map_rw(struct io_kiocb *req, ssize_t io_size,
 			  struct iovec *iovec, struct iovec *fast_iov,
 			  struct iov_iter *iter)
 {
-	req->io->rw.nr_segs = iter->nr_segs;
-	req->io->rw.size = io_size;
-	req->io->rw.iov = iovec;
-	if (!req->io->rw.iov) {
-		req->io->rw.iov = req->io->rw.fast_iov;
-		if (req->io->rw.iov != fast_iov)
-			memcpy(req->io->rw.iov, fast_iov,
+	struct io_async_rw *rw = &req->io->rw;
+
+	rw->nr_segs = iter->nr_segs;
+	rw->size = io_size;
+	if (!iovec) {
+		rw->iov = rw->fast_iov;
+		if (rw->iov != fast_iov)
+			memcpy(rw->iov, fast_iov,
 			       sizeof(struct iovec) * iter->nr_segs);
 	} else {
+		rw->iov = iovec;
 		req->flags |= REQ_F_NEED_CLEANUP;
 	}
 }

From c3e330a493740a2a8312dcb7b1cffceaec7f619a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 22:59:19 +0300
Subject: [PATCH 414/502] io_uring: add a helper for async rw iovec prep

Preparing reads/writes for async is a bit tricky. Extract a helper to
not repeat it twice.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 46 ++++++++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 26 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3b8465dd0214..31466bcd833e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2872,11 +2872,27 @@ static int io_setup_async_rw(struct io_kiocb *req, ssize_t io_size,
 	return 0;
 }
 
+static inline int io_rw_prep_async(struct io_kiocb *req, int rw,
+				   bool force_nonblock)
+{
+	struct io_async_ctx *io = req->io;
+	struct iov_iter iter;
+	ssize_t ret;
+
+	io->rw.iov = io->rw.fast_iov;
+	req->io = NULL;
+	ret = io_import_iovec(rw, req, &io->rw.iov, &iter, !force_nonblock);
+	req->io = io;
+	if (unlikely(ret < 0))
+		return ret;
+
+	io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
+	return 0;
+}
+
 static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			bool force_nonblock)
 {
-	struct io_async_ctx *io;
-	struct iov_iter iter;
 	ssize_t ret;
 
 	ret = io_prep_rw(req, sqe, force_nonblock);
@@ -2889,17 +2905,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	/* either don't need iovec imported or already have it */
 	if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
 		return 0;
-
-	io = req->io;
-	io->rw.iov = io->rw.fast_iov;
-	req->io = NULL;
-	ret = io_import_iovec(READ, req, &io->rw.iov, &iter, !force_nonblock);
-	req->io = io;
-	if (ret < 0)
-		return ret;
-
-	io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
-	return 0;
+	return io_rw_prep_async(req, READ, force_nonblock);
 }
 
 static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode,
@@ -3043,8 +3049,6 @@ out_free:
 static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			 bool force_nonblock)
 {
-	struct io_async_ctx *io;
-	struct iov_iter iter;
 	ssize_t ret;
 
 	ret = io_prep_rw(req, sqe, force_nonblock);
@@ -3059,17 +3063,7 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	/* either don't need iovec imported or already have it */
 	if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
 		return 0;
-
-	io = req->io;
-	io->rw.iov = io->rw.fast_iov;
-	req->io = NULL;
-	ret = io_import_iovec(WRITE, req, &io->rw.iov, &iter, !force_nonblock);
-	req->io = io;
-	if (ret < 0)
-		return ret;
-
-	io_req_map_rw(req, ret, io->rw.iov, io->rw.fast_iov, &iter);
-	return 0;
+	return io_rw_prep_async(req, WRITE, force_nonblock);
 }
 
 static int io_write(struct io_kiocb *req, bool force_nonblock,

From 252917c30f551e8e4377faac81d7fcf8e9629df1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 22:59:20 +0300
Subject: [PATCH 415/502] io_uring: follow **iovec idiom in io_import_iovec

As with import_iovec(), return a non-NULL iovec from io_import_iovec()
only when it should be freed. That includes returning NULL when iovec is
already in req->io, because it should be deallocated by other means,
e.g. inside op handler. After io_setup_async_rw() has stored the local
iovec in ->io, just mark it NULL, to follow the idea in io_{read,write}.

That's easier to follow, and especially useful if we want to reuse
per-op space for completion data.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
[axboe: only call kfree() on non-NULL pointer]
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 31466bcd833e..64ae5b681c62 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2740,10 +2740,8 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
 	if (req->io) {
 		struct io_async_rw *iorw = &req->io->rw;
 
-		*iovec = iorw->iov;
-		iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
-		if (iorw->iov == iorw->fast_iov)
-			*iovec = NULL;
+		iov_iter_init(iter, rw, iorw->iov, iorw->nr_segs, iorw->size);
+		*iovec = NULL;
 		return iorw->size;
 	}
 
@@ -3026,6 +3024,8 @@ copy_iov:
 						inline_vecs, &iter);
 			if (ret)
 				goto out_free;
+			/* it's copied and will be cleaned with ->io */
+			iovec = NULL;
 			/* if we can retry, do so with the callbacks armed */
 			if (io_rw_should_retry(req)) {
 				ret2 = io_iter_do_read(req, &iter);
@@ -3041,7 +3041,7 @@ copy_iov:
 		}
 	}
 out_free:
-	if (!(req->flags & REQ_F_NEED_CLEANUP))
+	if (iovec)
 		kfree(iovec);
 	return ret;
 }
@@ -3143,11 +3143,13 @@ copy_iov:
 						inline_vecs, &iter);
 			if (ret)
 				goto out_free;
+			/* it's copied and will be cleaned with ->io */
+			iovec = NULL;
 			return -EAGAIN;
 		}
 	}
 out_free:
-	if (!(req->flags & REQ_F_NEED_CLEANUP))
+	if (iovec)
 		kfree(iovec);
 	return ret;
 }

From 3ca405ebfc1c3445b049dd25ca3338cbc99837d1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 23:37:08 +0300
Subject: [PATCH 416/502] io_uring: share completion list w/ per-op space

Calling io_req_complete(req) means that the request is done, and there
is nothing left but to clean it up. That also means that per-op data
after that should not be used, so we're free to reuse it in completion
path, e.g. to store overflow_list as done in this patch.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 64ae5b681c62..3cadd5f963b7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -487,6 +487,11 @@ struct io_statx {
 	struct statx __user		*buffer;
 };
 
+struct io_completion {
+	struct file			*file;
+	struct list_head		list;
+};
+
 struct io_async_connect {
 	struct sockaddr_storage		address;
 };
@@ -622,6 +627,8 @@ struct io_kiocb {
 		struct io_splice	splice;
 		struct io_provide_buf	pbuf;
 		struct io_statx		statx;
+		/* use only after cleaning per-op data, see io_clean_op() */
+		struct io_completion	compl;
 	};
 
 	struct io_async_ctx		*io;
@@ -896,7 +903,7 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 static int io_grab_files(struct io_kiocb *req);
 static void io_complete_rw_common(struct kiocb *kiocb, long res,
 				  struct io_comp_state *cs);
-static void io_cleanup_req(struct io_kiocb *req);
+static void __io_clean_op(struct io_kiocb *req);
 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 		       int fd, struct file **out_file, bool fixed);
 static void __io_queue_sqe(struct io_kiocb *req,
@@ -936,6 +943,12 @@ static void io_get_req_task(struct io_kiocb *req)
 	req->flags |= REQ_F_TASK_PINNED;
 }
 
+static inline void io_clean_op(struct io_kiocb *req)
+{
+	if (req->flags & REQ_F_NEED_CLEANUP)
+		__io_clean_op(req);
+}
+
 /* not idempotent -- it doesn't clear REQ_F_TASK_PINNED */
 static void __io_put_req_task(struct io_kiocb *req)
 {
@@ -1413,8 +1426,8 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
 	while (!list_empty(&cs->list)) {
 		struct io_kiocb *req;
 
-		req = list_first_entry(&cs->list, struct io_kiocb, list);
-		list_del(&req->list);
+		req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
+		list_del(&req->compl.list);
 		__io_cqring_fill_event(req, req->result, req->cflags);
 		if (!(req->flags & REQ_F_LINK_HEAD)) {
 			req->flags |= REQ_F_COMP_LOCKED;
@@ -1439,9 +1452,10 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
 		io_cqring_add_event(req, res, cflags);
 		io_put_req(req);
 	} else {
+		io_clean_op(req);
 		req->result = res;
 		req->cflags = cflags;
-		list_add_tail(&req->list, &cs->list);
+		list_add_tail(&req->compl.list, &cs->list);
 		if (++cs->nr >= 32)
 			io_submit_flush_completions(cs);
 	}
@@ -1515,8 +1529,7 @@ static inline void io_put_file(struct io_kiocb *req, struct file *file,
 
 static void io_dismantle_req(struct io_kiocb *req)
 {
-	if (req->flags & REQ_F_NEED_CLEANUP)
-		io_cleanup_req(req);
+	io_clean_op(req);
 
 	if (req->io)
 		kfree(req->io);
@@ -5402,7 +5415,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	return -EIOCBQUEUED;
 }
 
-static void io_cleanup_req(struct io_kiocb *req)
+static void __io_clean_op(struct io_kiocb *req)
 {
 	struct io_async_ctx *io = req->io;
 

From 540e32a0855e700affa29b1112bf2dbb1fa7702a Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 23:37:09 +0300
Subject: [PATCH 417/502] io_uring: rename ctx->poll into ctx->iopoll

io_uring supports both polling and I/O polling. Rename ctx->poll to
clearly show that it's only used in the I/O poll case.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3cadd5f963b7..c8ebd227c837 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -320,12 +320,12 @@ struct io_ring_ctx {
 		spinlock_t		completion_lock;
 
 		/*
-		 * ->poll_list is protected by the ctx->uring_lock for
+		 * ->iopoll_list is protected by the ctx->uring_lock for
 		 * io_uring instances that don't use IORING_SETUP_SQPOLL.
 		 * For SQPOLL, only the single threaded io_sq_thread() will
 		 * manipulate the list, hence no extra locking is needed there.
 		 */
-		struct list_head	poll_list;
+		struct list_head	iopoll_list;
 		struct hlist_head	*cancel_hash;
 		unsigned		cancel_hash_bits;
 		bool			poll_multi_file;
@@ -1064,7 +1064,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
 	mutex_init(&ctx->uring_lock);
 	init_waitqueue_head(&ctx->wait);
 	spin_lock_init(&ctx->completion_lock);
-	INIT_LIST_HEAD(&ctx->poll_list);
+	INIT_LIST_HEAD(&ctx->iopoll_list);
 	INIT_LIST_HEAD(&ctx->defer_list);
 	INIT_LIST_HEAD(&ctx->timeout_list);
 	init_waitqueue_head(&ctx->inflight_wait);
@@ -2009,7 +2009,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	spin = !ctx->poll_multi_file && *nr_events < min;
 
 	ret = 0;
-	list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) {
+	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, list) {
 		struct kiocb *kiocb = &req->rw.kiocb;
 
 		/*
@@ -2051,7 +2051,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
 				long min)
 {
-	while (!list_empty(&ctx->poll_list) && !need_resched()) {
+	while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
 		int ret;
 
 		ret = io_do_iopoll(ctx, nr_events, min);
@@ -2074,7 +2074,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
 		return;
 
 	mutex_lock(&ctx->uring_lock);
-	while (!list_empty(&ctx->poll_list)) {
+	while (!list_empty(&ctx->iopoll_list)) {
 		unsigned int nr_events = 0;
 
 		io_do_iopoll(ctx, &nr_events, 0);
@@ -2291,12 +2291,12 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 	 * how we do polling eventually, not spinning if we're on potentially
 	 * different devices.
 	 */
-	if (list_empty(&ctx->poll_list)) {
+	if (list_empty(&ctx->iopoll_list)) {
 		ctx->poll_multi_file = false;
 	} else if (!ctx->poll_multi_file) {
 		struct io_kiocb *list_req;
 
-		list_req = list_first_entry(&ctx->poll_list, struct io_kiocb,
+		list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
 						list);
 		if (list_req->file != req->file)
 			ctx->poll_multi_file = true;
@@ -2307,9 +2307,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 	 * it to the front so we find it first.
 	 */
 	if (READ_ONCE(req->iopoll_completed))
-		list_add(&req->list, &ctx->poll_list);
+		list_add(&req->list, &ctx->iopoll_list);
 	else
-		list_add_tail(&req->list, &ctx->poll_list);
+		list_add_tail(&req->list, &ctx->iopoll_list);
 
 	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
 	    wq_has_sleeper(&ctx->sqo_wait))
@@ -6329,11 +6329,11 @@ static int io_sq_thread(void *data)
 	while (!kthread_should_park()) {
 		unsigned int to_submit;
 
-		if (!list_empty(&ctx->poll_list)) {
+		if (!list_empty(&ctx->iopoll_list)) {
 			unsigned nr_events = 0;
 
 			mutex_lock(&ctx->uring_lock);
-			if (!list_empty(&ctx->poll_list) && !need_resched())
+			if (!list_empty(&ctx->iopoll_list) && !need_resched())
 				io_do_iopoll(ctx, &nr_events, 0);
 			else
 				timeout = jiffies + ctx->sq_thread_idle;
@@ -6362,7 +6362,7 @@ static int io_sq_thread(void *data)
 			 * more IO, we should wait for the application to
 			 * reap events and wake us up.
 			 */
-			if (!list_empty(&ctx->poll_list) || need_resched() ||
+			if (!list_empty(&ctx->iopoll_list) || need_resched() ||
 			    (!time_after(jiffies, timeout) && ret != -EBUSY &&
 			    !percpu_ref_is_dying(&ctx->refs))) {
 				io_run_task_work();
@@ -6375,13 +6375,13 @@ static int io_sq_thread(void *data)
 
 			/*
 			 * While doing polled IO, before going to sleep, we need
-			 * to check if there are new reqs added to poll_list, it
-			 * is because reqs may have been punted to io worker and
-			 * will be added to poll_list later, hence check the
-			 * poll_list again.
+			 * to check if there are new reqs added to iopoll_list,
+			 * it is because reqs may have been punted to io worker
+			 * and will be added to iopoll_list later, hence check
+			 * the iopoll_list again.
 			 */
 			if ((ctx->flags & IORING_SETUP_IOPOLL) &&
-			    !list_empty_careful(&ctx->poll_list)) {
+			    !list_empty_careful(&ctx->iopoll_list)) {
 				finish_wait(&ctx->sqo_wait, &wait);
 				continue;
 			}

From d21ffe7eca82d47b489760899912f81e30456e2e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 23:37:10 +0300
Subject: [PATCH 418/502] io_uring: use inflight_entry list for iopoll'ing

req->inflight_entry is used to track requests that grabbed files_struct.
Let's share it with iopoll list, because the only iopoll'ed ops are
reads and writes, which don't need a file table.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c8ebd227c837..8a89480a57ec 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -651,6 +651,10 @@ struct io_kiocb {
 
 	struct list_head	link_list;
 
+	/*
+	 * 1. used with ctx->iopoll_list with reads/writes
+	 * 2. to track reqs with ->files (see io_op_def::file_table)
+	 */
 	struct list_head	inflight_entry;
 
 	struct percpu_ref	*fixed_file_refs;
@@ -1943,8 +1947,8 @@ static void io_iopoll_queue(struct list_head *again)
 	struct io_kiocb *req;
 
 	do {
-		req = list_first_entry(again, struct io_kiocb, list);
-		list_del(&req->list);
+		req = list_first_entry(again, struct io_kiocb, inflight_entry);
+		list_del(&req->inflight_entry);
 		if (!io_rw_reissue(req, -EAGAIN))
 			io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL);
 	} while (!list_empty(again));
@@ -1967,13 +1971,13 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	while (!list_empty(done)) {
 		int cflags = 0;
 
-		req = list_first_entry(done, struct io_kiocb, list);
+		req = list_first_entry(done, struct io_kiocb, inflight_entry);
 		if (READ_ONCE(req->result) == -EAGAIN) {
 			req->iopoll_completed = 0;
-			list_move_tail(&req->list, &again);
+			list_move_tail(&req->inflight_entry, &again);
 			continue;
 		}
-		list_del(&req->list);
+		list_del(&req->inflight_entry);
 
 		if (req->flags & REQ_F_BUFFER_SELECTED)
 			cflags = io_put_kbuf(req);
@@ -2009,7 +2013,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	spin = !ctx->poll_multi_file && *nr_events < min;
 
 	ret = 0;
-	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, list) {
+	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
 		struct kiocb *kiocb = &req->rw.kiocb;
 
 		/*
@@ -2018,7 +2022,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		 * and complete those lists first, if we have entries there.
 		 */
 		if (READ_ONCE(req->iopoll_completed)) {
-			list_move_tail(&req->list, &done);
+			list_move_tail(&req->inflight_entry, &done);
 			continue;
 		}
 		if (!list_empty(&done))
@@ -2030,7 +2034,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
 
 		/* iopoll may have completed current req */
 		if (READ_ONCE(req->iopoll_completed))
-			list_move_tail(&req->list, &done);
+			list_move_tail(&req->inflight_entry, &done);
 
 		if (ret && spin)
 			spin = false;
@@ -2297,7 +2301,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 		struct io_kiocb *list_req;
 
 		list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
-						list);
+						inflight_entry);
 		if (list_req->file != req->file)
 			ctx->poll_multi_file = true;
 	}
@@ -2307,9 +2311,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 	 * it to the front so we find it first.
 	 */
 	if (READ_ONCE(req->iopoll_completed))
-		list_add(&req->list, &ctx->iopoll_list);
+		list_add(&req->inflight_entry, &ctx->iopoll_list);
 	else
-		list_add_tail(&req->list, &ctx->iopoll_list);
+		list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
 
 	if ((ctx->flags & IORING_SETUP_SQPOLL) &&
 	    wq_has_sleeper(&ctx->sqo_wait))

From 40d8ddd4facb80760d5a0c61a7cf026d5ff73ff0 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 23:37:11 +0300
Subject: [PATCH 419/502] io_uring: use completion list for CQ overflow

As with the completion path, also use compl.list for overflowed
requests. If cleaned up properly, nobody needs per-op data there
anymore.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8a89480a57ec..2122b37e68e3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1339,8 +1339,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 			break;
 
 		req = list_first_entry(&ctx->cq_overflow_list, struct io_kiocb,
-						list);
-		list_move(&req->list, &list);
+						compl.list);
+		list_move(&req->compl.list, &list);
 		req->flags &= ~REQ_F_OVERFLOW;
 		if (cqe) {
 			WRITE_ONCE(cqe->user_data, req->user_data);
@@ -1362,8 +1362,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	io_cqring_ev_posted(ctx);
 
 	while (!list_empty(&list)) {
-		req = list_first_entry(&list, struct io_kiocb, list);
-		list_del(&req->list);
+		req = list_first_entry(&list, struct io_kiocb, compl.list);
+		list_del(&req->compl.list);
 		io_put_req(req);
 	}
 
@@ -1396,11 +1396,12 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 			set_bit(0, &ctx->cq_check_overflow);
 			ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW;
 		}
+		io_clean_op(req);
 		req->flags |= REQ_F_OVERFLOW;
-		refcount_inc(&req->refs);
 		req->result = res;
 		req->cflags = cflags;
-		list_add_tail(&req->list, &ctx->cq_overflow_list);
+		refcount_inc(&req->refs);
+		list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
 	}
 }
 
@@ -7835,7 +7836,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 
 		if (cancel_req->flags & REQ_F_OVERFLOW) {
 			spin_lock_irq(&ctx->completion_lock);
-			list_del(&cancel_req->list);
+			list_del(&cancel_req->compl.list);
 			cancel_req->flags &= ~REQ_F_OVERFLOW;
 			if (list_empty(&ctx->cq_overflow_list)) {
 				clear_bit(0, &ctx->sq_check_overflow);

From 135fcde8496b03d31648171dbc038990112e41d5 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 23:37:12 +0300
Subject: [PATCH 420/502] io_uring: add req->timeout.list

Instead of using shared req->list, hang timeouts up on their own list
entry. struct io_timeout has enough extra space for it, but if that
becomes a problem, ->inflight_entry can be reused for that.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2122b37e68e3..2544795cfd30 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -396,6 +396,7 @@ struct io_timeout {
 	int				flags;
 	u32				off;
 	u32				target_seq;
+	struct list_head		list;
 };
 
 struct io_rw {
@@ -1213,7 +1214,7 @@ static void io_kill_timeout(struct io_kiocb *req)
 	ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
 	if (ret != -1) {
 		atomic_inc(&req->ctx->cq_timeouts);
-		list_del_init(&req->list);
+		list_del_init(&req->timeout.list);
 		req->flags |= REQ_F_COMP_LOCKED;
 		io_cqring_fill_event(req, 0);
 		io_put_req(req);
@@ -1225,7 +1226,7 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
 	struct io_kiocb *req, *tmp;
 
 	spin_lock_irq(&ctx->completion_lock);
-	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list)
+	list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list)
 		io_kill_timeout(req);
 	spin_unlock_irq(&ctx->completion_lock);
 }
@@ -1248,7 +1249,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
 {
 	while (!list_empty(&ctx->timeout_list)) {
 		struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
-							struct io_kiocb, list);
+						struct io_kiocb, timeout.list);
 
 		if (io_is_timeout_noseq(req))
 			break;
@@ -1256,7 +1257,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx)
 					- atomic_read(&ctx->cq_timeouts))
 			break;
 
-		list_del_init(&req->list);
+		list_del_init(&req->timeout.list);
 		io_kill_timeout(req);
 	}
 }
@@ -4997,8 +4998,8 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	 * We could be racing with timeout deletion. If the list is empty,
 	 * then timeout lookup already found it and will be handling it.
 	 */
-	if (!list_empty(&req->list))
-		list_del_init(&req->list);
+	if (!list_empty(&req->timeout.list))
+		list_del_init(&req->timeout.list);
 
 	io_cqring_fill_event(req, -ETIME);
 	io_commit_cqring(ctx);
@@ -5015,9 +5016,9 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 	struct io_kiocb *req;
 	int ret = -ENOENT;
 
-	list_for_each_entry(req, &ctx->timeout_list, list) {
+	list_for_each_entry(req, &ctx->timeout_list, timeout.list) {
 		if (user_data == req->user_data) {
-			list_del_init(&req->list);
+			list_del_init(&req->timeout.list);
 			ret = 0;
 			break;
 		}
@@ -5139,7 +5140,8 @@ static int io_timeout(struct io_kiocb *req)
 	 * the one we need first.
 	 */
 	list_for_each_prev(entry, &ctx->timeout_list) {
-		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
+		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb,
+						  timeout.list);
 
 		if (io_is_timeout_noseq(nxt))
 			continue;
@@ -5148,7 +5150,7 @@ static int io_timeout(struct io_kiocb *req)
 			break;
 	}
 add:
-	list_add(&req->list, entry);
+	list_add(&req->timeout.list, entry);
 	data->timer.function = io_timeout_fn;
 	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
 	spin_unlock_irq(&ctx->completion_lock);

From 7d6ddea6beaf6639cf3a2b291dcdac6fe1edc584 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 23:37:13 +0300
Subject: [PATCH 421/502] io_uring: remove init for unused list

poll*() doesn't use req->list, don't init it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2544795cfd30..1e4ac48b1557 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4964,7 +4964,6 @@ static int io_poll_add(struct io_kiocb *req)
 	req->flags &= ~REQ_F_WORK_INITIALIZED;
 
 	INIT_HLIST_NODE(&req->hash_node);
-	INIT_LIST_HEAD(&req->list);
 	ipt.pt._qproc = io_poll_queue_proc;
 
 	mask = __io_arm_poll_handler(req, &req->poll, &ipt, poll->events,

From 27dc8338e5fb0e0ed5b272e792f4ffad7f3bc03e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 23:37:14 +0300
Subject: [PATCH 422/502] io_uring: use non-intrusive list for defer

The only remaining user of req->list is DRAIN, hence instead of keeping a
separate per-request list for it, do that with old-fashioned non-intrusive
lists allocated on demand. That's a really slow path, so that's OK.

This removes req->list and so sheds 16 bytes from io_kiocb.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 1e4ac48b1557..6e6e71310785 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -641,7 +641,6 @@ struct io_kiocb {
 	u16				buf_index;
 
 	struct io_ring_ctx	*ctx;
-	struct list_head	list;
 	unsigned int		flags;
 	refcount_t		refs;
 	struct task_struct	*task;
@@ -676,6 +675,11 @@ struct io_kiocb {
 	struct callback_head	task_work;
 };
 
+struct io_defer_entry {
+	struct list_head	list;
+	struct io_kiocb		*req;
+};
+
 #define IO_IOPOLL_BATCH			8
 
 struct io_comp_state {
@@ -1234,14 +1238,15 @@ static void io_kill_timeouts(struct io_ring_ctx *ctx)
 static void __io_queue_deferred(struct io_ring_ctx *ctx)
 {
 	do {
-		struct io_kiocb *req = list_first_entry(&ctx->defer_list,
-							struct io_kiocb, list);
+		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
+						struct io_defer_entry, list);
 
-		if (req_need_defer(req))
+		if (req_need_defer(de->req))
 			break;
-		list_del_init(&req->list);
+		list_del_init(&de->list);
 		/* punt-init is done before queueing for defer */
-		__io_queue_async_work(req);
+		__io_queue_async_work(de->req);
+		kfree(de);
 	} while (!list_empty(&ctx->defer_list));
 }
 
@@ -5394,6 +5399,7 @@ static int io_req_defer_prep(struct io_kiocb *req,
 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_defer_entry *de;
 	int ret;
 
 	/* Still need defer if there is pending req in defer list. */
@@ -5408,15 +5414,20 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 			return ret;
 	}
 	io_prep_async_link(req);
+	de = kmalloc(sizeof(*de), GFP_KERNEL);
+	if (!de)
+		return -ENOMEM;
 
 	spin_lock_irq(&ctx->completion_lock);
 	if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
 		spin_unlock_irq(&ctx->completion_lock);
+		kfree(de);
 		return 0;
 	}
 
 	trace_io_uring_defer(ctx, req, req->user_data);
-	list_add_tail(&req->list, &ctx->defer_list);
+	de->req = req;
+	list_add_tail(&de->list, &ctx->defer_list);
 	spin_unlock_irq(&ctx->completion_lock);
 	return -EIOCBQUEUED;
 }

From 9cf7c104deaef52d6fd7c103a716e31d9815ede8 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 23:37:15 +0300
Subject: [PATCH 423/502] io_uring: remove sequence from io_kiocb

req->sequence is used only for deferred (i.e. DRAIN) requests, but
initialised for every request. Remove req->sequence from io_kiocb
together with its initialisation in io_init_req().

Replace it with a new field in struct io_defer_entry, that will be
calculated only when needed in io_req_defer(), which is a slow path.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6e6e71310785..efa132831f3d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -639,6 +639,7 @@ struct io_kiocb {
 	u8				iopoll_completed;
 
 	u16				buf_index;
+	u32				result;
 
 	struct io_ring_ctx	*ctx;
 	unsigned int		flags;
@@ -646,8 +647,6 @@ struct io_kiocb {
 	struct task_struct	*task;
 	unsigned long		fsize;
 	u64			user_data;
-	u32			result;
-	u32			sequence;
 
 	struct list_head	link_list;
 
@@ -678,6 +677,7 @@ struct io_kiocb {
 struct io_defer_entry {
 	struct list_head	list;
 	struct io_kiocb		*req;
+	u32			seq;
 };
 
 #define IO_IOPOLL_BATCH			8
@@ -1090,13 +1090,13 @@ err:
 	return NULL;
 }
 
-static inline bool req_need_defer(struct io_kiocb *req)
+static bool req_need_defer(struct io_kiocb *req, u32 seq)
 {
 	if (unlikely(req->flags & REQ_F_IO_DRAIN)) {
 		struct io_ring_ctx *ctx = req->ctx;
 
-		return req->sequence != ctx->cached_cq_tail
-					+ atomic_read(&ctx->cached_cq_overflow);
+		return seq != ctx->cached_cq_tail
+				+ atomic_read(&ctx->cached_cq_overflow);
 	}
 
 	return false;
@@ -1241,7 +1241,7 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx)
 		struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
 						struct io_defer_entry, list);
 
-		if (req_need_defer(de->req))
+		if (req_need_defer(de->req, de->seq))
 			break;
 		list_del_init(&de->list);
 		/* punt-init is done before queueing for defer */
@@ -5396,14 +5396,35 @@ static int io_req_defer_prep(struct io_kiocb *req,
 	return ret;
 }
 
+static u32 io_get_sequence(struct io_kiocb *req)
+{
+	struct io_kiocb *pos;
+	struct io_ring_ctx *ctx = req->ctx;
+	u32 total_submitted, nr_reqs = 1;
+
+	if (req->flags & REQ_F_LINK_HEAD)
+		list_for_each_entry(pos, &req->link_list, link_list)
+			nr_reqs++;
+
+	total_submitted = ctx->cached_sq_head - ctx->cached_sq_dropped;
+	return total_submitted - nr_reqs;
+}
+
 static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_defer_entry *de;
 	int ret;
+	u32 seq;
 
 	/* Still need defer if there is pending req in defer list. */
-	if (!req_need_defer(req) && list_empty_careful(&ctx->defer_list))
+	if (likely(list_empty_careful(&ctx->defer_list) &&
+		!(req->flags & REQ_F_IO_DRAIN)))
+		return 0;
+
+	seq = io_get_sequence(req);
+	/* Still a chance to pass the sequence check */
+	if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list))
 		return 0;
 
 	if (!req->io) {
@@ -5419,7 +5440,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return -ENOMEM;
 
 	spin_lock_irq(&ctx->completion_lock);
-	if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
+	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
 		spin_unlock_irq(&ctx->completion_lock);
 		kfree(de);
 		return 0;
@@ -5427,6 +5448,7 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 
 	trace_io_uring_defer(ctx, req, req->user_data);
 	de->req = req;
+	de->seq = seq;
 	list_add_tail(&de->list, &ctx->defer_list);
 	spin_unlock_irq(&ctx->completion_lock);
 	return -EIOCBQUEUED;
@@ -6204,12 +6226,6 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
 	unsigned int sqe_flags;
 	int id;
 
-	/*
-	 * All io need record the previous position, if LINK vs DARIN,
-	 * it can be used to mark the position of the first IO in the
-	 * link list.
-	 */
-	req->sequence = ctx->cached_sq_head - ctx->cached_sq_dropped;
 	req->opcode = READ_ONCE(sqe->opcode);
 	req->user_data = READ_ONCE(sqe->user_data);
 	req->io = NULL;

From 0f7e466b393abab86be96ffcf00af383afddc0d1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Mon, 13 Jul 2020 23:37:16 +0300
Subject: [PATCH 424/502] io_uring: place cflags into completion data

req->cflags is used only for defer-completion path, just use completion
data to store it. With the 4 bytes from the ->sequence patch and
compacting io_kiocb, this frees 8 bytes.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index efa132831f3d..4d0fd9ddd3dc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -491,6 +491,7 @@ struct io_statx {
 struct io_completion {
 	struct file			*file;
 	struct list_head		list;
+	int				cflags;
 };
 
 struct io_async_connect {
@@ -633,7 +634,6 @@ struct io_kiocb {
 	};
 
 	struct io_async_ctx		*io;
-	int				cflags;
 	u8				opcode;
 	/* polled IO has completed */
 	u8				iopoll_completed;
@@ -1351,7 +1351,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 		if (cqe) {
 			WRITE_ONCE(cqe->user_data, req->user_data);
 			WRITE_ONCE(cqe->res, req->result);
-			WRITE_ONCE(cqe->flags, req->cflags);
+			WRITE_ONCE(cqe->flags, req->compl.cflags);
 		} else {
 			WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
@@ -1405,7 +1405,7 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags)
 		io_clean_op(req);
 		req->flags |= REQ_F_OVERFLOW;
 		req->result = res;
-		req->cflags = cflags;
+		req->compl.cflags = cflags;
 		refcount_inc(&req->refs);
 		list_add_tail(&req->compl.list, &ctx->cq_overflow_list);
 	}
@@ -1439,7 +1439,7 @@ static void io_submit_flush_completions(struct io_comp_state *cs)
 
 		req = list_first_entry(&cs->list, struct io_kiocb, compl.list);
 		list_del(&req->compl.list);
-		__io_cqring_fill_event(req, req->result, req->cflags);
+		__io_cqring_fill_event(req, req->result, req->compl.cflags);
 		if (!(req->flags & REQ_F_LINK_HEAD)) {
 			req->flags |= REQ_F_COMP_LOCKED;
 			io_put_req(req);
@@ -1465,7 +1465,7 @@ static void __io_req_complete(struct io_kiocb *req, long res, unsigned cflags,
 	} else {
 		io_clean_op(req);
 		req->result = res;
-		req->cflags = cflags;
+		req->compl.cflags = cflags;
 		list_add_tail(&req->compl.list, &cs->list);
 		if (++cs->nr >= 32)
 			io_submit_flush_completions(cs);

From dca9cf8b87f55c96f072c1fc6bc90e2b97a8e19f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 15 Jul 2020 12:46:49 +0300
Subject: [PATCH 425/502] io_uring: inline io_req_work_grab_env()

The only caller of io_req_work_grab_env() is io_prep_async_work(), and
they are both initialising req->work. Inline grab_env(), it's easier
to keep this way, moreover there already were bugs with misplacing
io_req_init_async().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 50 ++++++++++++++++++++------------------------------
 1 file changed, 20 insertions(+), 30 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4d0fd9ddd3dc..a06d5b9cc046 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1115,31 +1115,7 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx)
 	}
 }
 
-static void io_req_work_grab_env(struct io_kiocb *req)
-{
-	const struct io_op_def *def = &io_op_defs[req->opcode];
-
-	io_req_init_async(req);
-
-	if (!req->work.mm && def->needs_mm) {
-		mmgrab(current->mm);
-		req->work.mm = current->mm;
-	}
-	if (!req->work.creds)
-		req->work.creds = get_current_cred();
-	if (!req->work.fs && def->needs_fs) {
-		spin_lock(&current->fs->lock);
-		if (!current->fs->in_exec) {
-			req->work.fs = current->fs;
-			req->work.fs->users++;
-		} else {
-			req->work.flags |= IO_WQ_WORK_CANCEL;
-		}
-		spin_unlock(&current->fs->lock);
-	}
-}
-
-static inline void io_req_work_drop_env(struct io_kiocb *req)
+static void io_req_clean_work(struct io_kiocb *req)
 {
 	if (!(req->flags & REQ_F_WORK_INITIALIZED))
 		return;
@@ -1177,8 +1153,22 @@ static void io_prep_async_work(struct io_kiocb *req)
 		if (def->unbound_nonreg_file)
 			req->work.flags |= IO_WQ_WORK_UNBOUND;
 	}
-
-	io_req_work_grab_env(req);
+	if (!req->work.mm && def->needs_mm) {
+		mmgrab(current->mm);
+		req->work.mm = current->mm;
+	}
+	if (!req->work.creds)
+		req->work.creds = get_current_cred();
+	if (!req->work.fs && def->needs_fs) {
+		spin_lock(&current->fs->lock);
+		if (!current->fs->in_exec) {
+			req->work.fs = current->fs;
+			req->work.fs->users++;
+		} else {
+			req->work.flags |= IO_WQ_WORK_CANCEL;
+		}
+		spin_unlock(&current->fs->lock);
+	}
 }
 
 static void io_prep_async_link(struct io_kiocb *req)
@@ -1547,7 +1537,7 @@ static void io_dismantle_req(struct io_kiocb *req)
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
 	__io_put_req_task(req);
-	io_req_work_drop_env(req);
+	io_req_clean_work(req);
 
 	if (req->flags & REQ_F_INFLIGHT) {
 		struct io_ring_ctx *ctx = req->ctx;
@@ -4825,7 +4815,7 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 			io_put_req(req);
 			/*
 			 * restore ->work because we will call
-			 * io_req_work_drop_env below when dropping the
+			 * io_req_clean_work below when dropping the
 			 * final reference.
 			 */
 			if (req->flags & REQ_F_WORK_INITIALIZED)
@@ -4965,7 +4955,7 @@ static int io_poll_add(struct io_kiocb *req)
 	__poll_t mask;
 
 	/* ->work is in union with hash_node and others */
-	io_req_work_drop_env(req);
+	io_req_clean_work(req);
 	req->flags &= ~REQ_F_WORK_INITIALIZED;
 
 	INIT_HLIST_NODE(&req->hash_node);

From 1c2da9e8839d6437b43f2c805411d1a0cbd70165 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 15 Jul 2020 12:46:50 +0300
Subject: [PATCH 426/502] io_uring: remove empty cleanup of OP_OPEN* reqs

A switch in __io_clean_op() doesn't have a default case, so it's
pointless to list opcodes that don't do any cleanup. Remove
IORING_OP_OPEN* from there.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index a06d5b9cc046..8d6f1c4e8dac 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5473,9 +5473,6 @@ static void __io_clean_op(struct io_kiocb *req)
 		if (req->flags & REQ_F_BUFFER_SELECTED)
 			kfree(req->sr_msg.kbuf);
 		break;
-	case IORING_OP_OPENAT:
-	case IORING_OP_OPENAT2:
-		break;
 	case IORING_OP_SPLICE:
 	case IORING_OP_TEE:
 		io_put_file(req, req->splice.file_in,

From 327d6d968b195cfc48ff97c49b56520aac922f65 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 15 Jul 2020 12:46:51 +0300
Subject: [PATCH 427/502] io_uring: alloc ->io in io_req_defer_prep()

Every call to io_req_defer_prep() is preceded by allocating ->io, so
just do that in the function. And while we're at it, mark error paths
with unlikely() and replace "if (ret < 0)" with "if (ret)".

There is only one change in the observable behaviour, that's instead of
killing the head request right away on error, it postpones it until the
link is assembled, that looks more preferable.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8d6f1c4e8dac..6a1cd2aea018 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5279,6 +5279,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
 	if (!sqe)
 		return 0;
 
+	if (io_alloc_async_ctx(req))
+		return -EAGAIN;
+
 	if (io_op_defs[req->opcode].file_table) {
 		io_req_init_async(req);
 		ret = io_grab_files(req);
@@ -5418,10 +5421,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		return 0;
 
 	if (!req->io) {
-		if (io_alloc_async_ctx(req))
-			return -EAGAIN;
 		ret = io_req_defer_prep(req, sqe);
-		if (ret < 0)
+		if (ret)
 			return ret;
 	}
 	io_prep_async_link(req);
@@ -6024,11 +6025,8 @@ fail_req:
 		}
 	} else if (req->flags & REQ_F_FORCE_ASYNC) {
 		if (!req->io) {
-			ret = -EAGAIN;
-			if (io_alloc_async_ctx(req))
-				goto fail_req;
 			ret = io_req_defer_prep(req, sqe);
-			if (unlikely(ret < 0))
+			if (unlikely(ret))
 				goto fail_req;
 		}
 
@@ -6081,11 +6079,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			head->flags |= REQ_F_IO_DRAIN;
 			ctx->drain_next = 1;
 		}
-		if (io_alloc_async_ctx(req))
-			return -EAGAIN;
-
 		ret = io_req_defer_prep(req, sqe);
-		if (ret) {
+		if (unlikely(ret)) {
 			/* fail even hard links since we don't submit */
 			head->flags |= REQ_F_FAIL_LINK;
 			return ret;
@@ -6108,11 +6103,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 			req->flags |= REQ_F_LINK_HEAD;
 			INIT_LIST_HEAD(&req->link_list);
 
-			if (io_alloc_async_ctx(req))
-				return -EAGAIN;
-
 			ret = io_req_defer_prep(req, sqe);
-			if (ret)
+			if (unlikely(ret))
 				req->flags |= REQ_F_FAIL_LINK;
 			*link = req;
 		} else {

From 57f1a64958543fe18a7fe0addbfb31bb2ceeaea2 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Wed, 15 Jul 2020 12:46:52 +0300
Subject: [PATCH 428/502] io_uring/io-wq: move RLIMIT_FSIZE to io-wq

RLIMIT_FSIZE is needed only for execution from an io-wq context, hence
move all preparations from the hot path to io-wq work setup.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c    |  1 +
 fs/io-wq.h    |  1 +
 fs/io_uring.c | 22 +++++++++-------------
 3 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 72f759e1d6eb..8702d3c3b291 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -462,6 +462,7 @@ static void io_impersonate_work(struct io_worker *worker,
 		io_wq_switch_mm(worker, work);
 	if (worker->cur_creds != work->creds)
 		io_wq_switch_creds(worker, work);
+	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;
 }
 
 static void io_assign_current_work(struct io_worker *worker,
diff --git a/fs/io-wq.h b/fs/io-wq.h
index 114f12ec2d65..ddaf9614cf9b 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -89,6 +89,7 @@ struct io_wq_work {
 	struct mm_struct *mm;
 	const struct cred *creds;
 	struct fs_struct *fs;
+	unsigned long fsize;
 	unsigned flags;
 };
 
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6a1cd2aea018..8b2f7a1bbd06 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -645,7 +645,6 @@ struct io_kiocb {
 	unsigned int		flags;
 	refcount_t		refs;
 	struct task_struct	*task;
-	unsigned long		fsize;
 	u64			user_data;
 
 	struct list_head	link_list;
@@ -736,6 +735,7 @@ struct io_op_def {
 	unsigned		pollout : 1;
 	/* op supports buffer selection */
 	unsigned		buffer_select : 1;
+	unsigned		needs_fsize : 1;
 };
 
 static const struct io_op_def io_op_defs[] = {
@@ -755,6 +755,7 @@ static const struct io_op_def io_op_defs[] = {
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.needs_fsize		= 1,
 	},
 	[IORING_OP_FSYNC] = {
 		.needs_file		= 1,
@@ -769,6 +770,7 @@ static const struct io_op_def io_op_defs[] = {
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.needs_fsize		= 1,
 	},
 	[IORING_OP_POLL_ADD] = {
 		.needs_file		= 1,
@@ -821,6 +823,7 @@ static const struct io_op_def io_op_defs[] = {
 	},
 	[IORING_OP_FALLOCATE] = {
 		.needs_file		= 1,
+		.needs_fsize		= 1,
 	},
 	[IORING_OP_OPENAT] = {
 		.file_table		= 1,
@@ -852,6 +855,7 @@ static const struct io_op_def io_op_defs[] = {
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
+		.needs_fsize		= 1,
 	},
 	[IORING_OP_FADVISE] = {
 		.needs_file		= 1,
@@ -1169,6 +1173,10 @@ static void io_prep_async_work(struct io_kiocb *req)
 		}
 		spin_unlock(&current->fs->lock);
 	}
+	if (def->needs_fsize)
+		req->work.fsize = rlimit(RLIMIT_FSIZE);
+	else
+		req->work.fsize = RLIM_INFINITY;
 }
 
 static void io_prep_async_link(struct io_kiocb *req)
@@ -3072,8 +3080,6 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
-	req->fsize = rlimit(RLIMIT_FSIZE);
-
 	/* either don't need iovec imported or already have it */
 	if (!req->io || req->flags & REQ_F_NEED_CLEANUP)
 		return 0;
@@ -3130,17 +3136,11 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 		}
 		kiocb->ki_flags |= IOCB_WRITE;
 
-		if (!force_nonblock)
-			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
-
 		if (req->file->f_op->write_iter)
 			ret2 = call_write_iter(req->file, kiocb, &iter);
 		else
 			ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
 
-		if (!force_nonblock)
-			current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
-
 		/*
 		 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
 		 * retry them without IOCB_NOWAIT.
@@ -3335,7 +3335,6 @@ static int io_fallocate_prep(struct io_kiocb *req,
 	req->sync.off = READ_ONCE(sqe->off);
 	req->sync.len = READ_ONCE(sqe->addr);
 	req->sync.mode = READ_ONCE(sqe->len);
-	req->fsize = rlimit(RLIMIT_FSIZE);
 	return 0;
 }
 
@@ -3346,11 +3345,8 @@ static int io_fallocate(struct io_kiocb *req, bool force_nonblock)
 	/* fallocate always requiring blocking context */
 	if (force_nonblock)
 		return -EAGAIN;
-
-	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize;
 	ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
 				req->sync.len);
-	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
 	if (ret < 0)
 		req_set_fail_links(req);
 	io_req_complete(req, ret);

From 06ef3608b0eed673fcbc62cf74c8d3ad0007a337 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jul 2020 23:28:33 +0300
Subject: [PATCH 429/502] io_uring: simplify file ref tracking in submission
 state

Currently, file refs in struct io_submit_state are tracked with 2 vars:
@has_refs -- how many refs were initially taken
@used_refs -- number of refs used

Replace it with a single variable counting how many refs left at the
current moment.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8b2f7a1bbd06..28b47533454a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -707,7 +707,6 @@ struct io_submit_state {
 	struct file		*file;
 	unsigned int		fd;
 	unsigned int		has_refs;
-	unsigned int		used_refs;
 	unsigned int		ios_left;
 };
 
@@ -2327,10 +2326,8 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
 
 static void __io_state_file_put(struct io_submit_state *state)
 {
-	int diff = state->has_refs - state->used_refs;
-
-	if (diff)
-		fput_many(state->file, diff);
+	if (state->has_refs)
+		fput_many(state->file, state->has_refs);
 	state->file = NULL;
 }
 
@@ -2352,7 +2349,7 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
 
 	if (state->file) {
 		if (state->fd == fd) {
-			state->used_refs++;
+			state->has_refs--;
 			state->ios_left--;
 			return state->file;
 		}
@@ -2363,9 +2360,8 @@ static struct file *__io_file_get(struct io_submit_state *state, int fd)
 		return NULL;
 
 	state->fd = fd;
-	state->has_refs = state->ios_left;
-	state->used_refs = 1;
 	state->ios_left--;
+	state->has_refs = state->ios_left;
 	return state->file;
 }
 

From 7a7cacba8b4560403615b04d57bdcd1f93f90f10 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jul 2020 23:27:59 +0300
Subject: [PATCH 430/502] io_uring: indent left {send,recv}[msg]()

Flip over "if (sock)" condition with return on error, the upper layer
will take care. That change will be handy later, but already removes
an extra jump from hot path.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 269 +++++++++++++++++++++++++-------------------------
 1 file changed, 133 insertions(+), 136 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 28b47533454a..264b1e5e2d54 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3916,42 +3916,41 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 		      struct io_comp_state *cs)
 {
-	struct io_async_msghdr *kmsg = NULL;
+	struct io_async_msghdr iomsg, *kmsg = NULL;
 	struct socket *sock;
+	unsigned flags;
 	int ret;
 
 	sock = sock_from_file(req->file, &ret);
-	if (sock) {
-		struct io_async_msghdr iomsg;
-		unsigned flags;
+	if (unlikely(!sock))
+		return ret;
 
-		if (req->io) {
-			kmsg = &req->io->msg;
-			kmsg->msg.msg_name = &req->io->msg.addr;
-			/* if iov is set, it's allocated already */
-			if (!kmsg->iov)
-				kmsg->iov = kmsg->fast_iov;
-			kmsg->msg.msg_iter.iov = kmsg->iov;
-		} else {
-			ret = io_sendmsg_copy_hdr(req, &iomsg);
-			if (ret)
-				return ret;
-			kmsg = &iomsg;
-		}
-
-		flags = req->sr_msg.msg_flags;
-		if (flags & MSG_DONTWAIT)
-			req->flags |= REQ_F_NOWAIT;
-		else if (force_nonblock)
-			flags |= MSG_DONTWAIT;
-
-		ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
-		if (force_nonblock && ret == -EAGAIN)
-			return io_setup_async_msg(req, kmsg);
-		if (ret == -ERESTARTSYS)
-			ret = -EINTR;
+	if (req->io) {
+		kmsg = &req->io->msg;
+		kmsg->msg.msg_name = &req->io->msg.addr;
+		/* if iov is set, it's allocated already */
+		if (!kmsg->iov)
+			kmsg->iov = kmsg->fast_iov;
+		kmsg->msg.msg_iter.iov = kmsg->iov;
+	} else {
+		ret = io_sendmsg_copy_hdr(req, &iomsg);
+		if (ret)
+			return ret;
+		kmsg = &iomsg;
 	}
 
+	flags = req->sr_msg.msg_flags;
+	if (flags & MSG_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
+	else if (force_nonblock)
+		flags |= MSG_DONTWAIT;
+
+	ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags);
+	if (force_nonblock && ret == -EAGAIN)
+		return io_setup_async_msg(req, kmsg);
+	if (ret == -ERESTARTSYS)
+		ret = -EINTR;
+
 	if (kmsg && kmsg->iov != kmsg->fast_iov)
 		kfree(kmsg->iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
@@ -3964,39 +3963,38 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 static int io_send(struct io_kiocb *req, bool force_nonblock,
 		   struct io_comp_state *cs)
 {
+	struct io_sr_msg *sr = &req->sr_msg;
+	struct msghdr msg;
+	struct iovec iov;
 	struct socket *sock;
+	unsigned flags;
 	int ret;
 
 	sock = sock_from_file(req->file, &ret);
-	if (sock) {
-		struct io_sr_msg *sr = &req->sr_msg;
-		struct msghdr msg;
-		struct iovec iov;
-		unsigned flags;
+	if (unlikely(!sock))
+		return ret;
 
-		ret = import_single_range(WRITE, sr->buf, sr->len, &iov,
-						&msg.msg_iter);
-		if (ret)
-			return ret;
+	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
+	if (unlikely(ret))
+		return ret;
 
-		msg.msg_name = NULL;
-		msg.msg_control = NULL;
-		msg.msg_controllen = 0;
-		msg.msg_namelen = 0;
+	msg.msg_name = NULL;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_namelen = 0;
 
-		flags = req->sr_msg.msg_flags;
-		if (flags & MSG_DONTWAIT)
-			req->flags |= REQ_F_NOWAIT;
-		else if (force_nonblock)
-			flags |= MSG_DONTWAIT;
+	flags = req->sr_msg.msg_flags;
+	if (flags & MSG_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
+	else if (force_nonblock)
+		flags |= MSG_DONTWAIT;
 
-		msg.msg_flags = flags;
-		ret = sock_sendmsg(sock, &msg);
-		if (force_nonblock && ret == -EAGAIN)
-			return -EAGAIN;
-		if (ret == -ERESTARTSYS)
-			ret = -EINTR;
-	}
+	msg.msg_flags = flags;
+	ret = sock_sendmsg(sock, &msg);
+	if (force_nonblock && ret == -EAGAIN)
+		return -EAGAIN;
+	if (ret == -ERESTARTSYS)
+		ret = -EINTR;
 
 	if (ret < 0)
 		req_set_fail_links(req);
@@ -4149,62 +4147,62 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 		      struct io_comp_state *cs)
 {
-	struct io_async_msghdr *kmsg = NULL;
+	struct io_async_msghdr iomsg, *kmsg = NULL;
 	struct socket *sock;
+	struct io_buffer *kbuf;
+	unsigned flags;
 	int ret, cflags = 0;
 
 	sock = sock_from_file(req->file, &ret);
-	if (sock) {
-		struct io_buffer *kbuf;
-		struct io_async_msghdr iomsg;
-		unsigned flags;
+	if (unlikely(!sock))
+		return ret;
 
-		if (req->io) {
-			kmsg = &req->io->msg;
-			kmsg->msg.msg_name = &req->io->msg.addr;
-			/* if iov is set, it's allocated already */
-			if (!kmsg->iov)
-				kmsg->iov = kmsg->fast_iov;
-			kmsg->msg.msg_iter.iov = kmsg->iov;
-		} else {
-			ret = io_recvmsg_copy_hdr(req, &iomsg);
-			if (ret)
-				return ret;
-			kmsg = &iomsg;
-		}
-
-		kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
-		if (IS_ERR(kbuf)) {
-			return PTR_ERR(kbuf);
-		} else if (kbuf) {
-			kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
-			iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
-					1, req->sr_msg.len);
-		}
-
-		flags = req->sr_msg.msg_flags;
-		if (flags & MSG_DONTWAIT)
-			req->flags |= REQ_F_NOWAIT;
-		else if (force_nonblock)
-			flags |= MSG_DONTWAIT;
-
-		ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
-						kmsg->uaddr, flags);
-		if (force_nonblock && ret == -EAGAIN) {
-			ret = io_setup_async_msg(req, kmsg);
-			if (ret != -EAGAIN)
-				kfree(kbuf);
+	if (req->io) {
+		kmsg = &req->io->msg;
+		kmsg->msg.msg_name = &req->io->msg.addr;
+		/* if iov is set, it's allocated already */
+		if (!kmsg->iov)
+			kmsg->iov = kmsg->fast_iov;
+		kmsg->msg.msg_iter.iov = kmsg->iov;
+	} else {
+		ret = io_recvmsg_copy_hdr(req, &iomsg);
+		if (ret)
 			return ret;
-		}
-		if (ret == -ERESTARTSYS)
-			ret = -EINTR;
-		if (kbuf)
-			kfree(kbuf);
+		kmsg = &iomsg;
 	}
 
+	kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+	if (IS_ERR(kbuf)) {
+		return PTR_ERR(kbuf);
+	} else if (kbuf) {
+		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
+		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
+				1, req->sr_msg.len);
+	}
+
+	flags = req->sr_msg.msg_flags;
+	if (flags & MSG_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
+	else if (force_nonblock)
+		flags |= MSG_DONTWAIT;
+
+	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
+					kmsg->uaddr, flags);
+	if (force_nonblock && ret == -EAGAIN) {
+		ret = io_setup_async_msg(req, kmsg);
+		if (ret != -EAGAIN)
+			kfree(kbuf);
+		return ret;
+	}
+	if (ret == -ERESTARTSYS)
+		ret = -EINTR;
+	if (kbuf)
+		kfree(kbuf);
+
 	if (kmsg && kmsg->iov != kmsg->fast_iov)
 		kfree(kmsg->iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
+
 	if (ret < 0)
 		req_set_fail_links(req);
 	__io_req_complete(req, ret, cflags, cs);
@@ -4215,51 +4213,50 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
 		   struct io_comp_state *cs)
 {
 	struct io_buffer *kbuf = NULL;
+	struct io_sr_msg *sr = &req->sr_msg;
+	struct msghdr msg;
+	void __user *buf = sr->buf;
 	struct socket *sock;
+	struct iovec iov;
+	unsigned flags;
 	int ret, cflags = 0;
 
 	sock = sock_from_file(req->file, &ret);
-	if (sock) {
-		struct io_sr_msg *sr = &req->sr_msg;
-		void __user *buf = sr->buf;
-		struct msghdr msg;
-		struct iovec iov;
-		unsigned flags;
+	if (unlikely(!sock))
+		return ret;
 
-		kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
-		if (IS_ERR(kbuf))
-			return PTR_ERR(kbuf);
-		else if (kbuf)
-			buf = u64_to_user_ptr(kbuf->addr);
+	kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
+	else if (kbuf)
+		buf = u64_to_user_ptr(kbuf->addr);
 
-		ret = import_single_range(READ, buf, sr->len, &iov,
-						&msg.msg_iter);
-		if (ret) {
-			kfree(kbuf);
-			return ret;
-		}
-
-		req->flags |= REQ_F_NEED_CLEANUP;
-		msg.msg_name = NULL;
-		msg.msg_control = NULL;
-		msg.msg_controllen = 0;
-		msg.msg_namelen = 0;
-		msg.msg_iocb = NULL;
-		msg.msg_flags = 0;
-
-		flags = req->sr_msg.msg_flags;
-		if (flags & MSG_DONTWAIT)
-			req->flags |= REQ_F_NOWAIT;
-		else if (force_nonblock)
-			flags |= MSG_DONTWAIT;
-
-		ret = sock_recvmsg(sock, &msg, flags);
-		if (force_nonblock && ret == -EAGAIN)
-			return -EAGAIN;
-		if (ret == -ERESTARTSYS)
-			ret = -EINTR;
+	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
+	if (unlikely(ret)) {
+		kfree(kbuf);
+		return ret;
 	}
 
+	req->flags |= REQ_F_NEED_CLEANUP;
+	msg.msg_name = NULL;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_namelen = 0;
+	msg.msg_iocb = NULL;
+	msg.msg_flags = 0;
+
+	flags = req->sr_msg.msg_flags;
+	if (flags & MSG_DONTWAIT)
+		req->flags |= REQ_F_NOWAIT;
+	else if (force_nonblock)
+		flags |= MSG_DONTWAIT;
+
+	ret = sock_recvmsg(sock, &msg, flags);
+	if (force_nonblock && ret == -EAGAIN)
+		return -EAGAIN;
+	if (ret == -ERESTARTSYS)
+		ret = -EINTR;
+
 	kfree(kbuf);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)

From 6b754c8b912a164fbb15b7b839d51709c3d9ee6f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jul 2020 23:28:00 +0300
Subject: [PATCH 431/502] io_uring: remove extra checks in send/recv

With the return on a bad socket, kmsg is always non-null by the end
of the function, so prune the leftover extra checks and initialisations.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 264b1e5e2d54..ac3c16ea7d23 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3916,7 +3916,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 		      struct io_comp_state *cs)
 {
-	struct io_async_msghdr iomsg, *kmsg = NULL;
+	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
 	unsigned flags;
 	int ret;
@@ -3951,7 +3951,7 @@ static int io_sendmsg(struct io_kiocb *req, bool force_nonblock,
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 
-	if (kmsg && kmsg->iov != kmsg->fast_iov)
+	if (kmsg->iov != kmsg->fast_iov)
 		kfree(kmsg->iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
@@ -4147,7 +4147,7 @@ static int io_recvmsg_prep(struct io_kiocb *req,
 static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 		      struct io_comp_state *cs)
 {
-	struct io_async_msghdr iomsg, *kmsg = NULL;
+	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
 	struct io_buffer *kbuf;
 	unsigned flags;
@@ -4199,7 +4199,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 	if (kbuf)
 		kfree(kbuf);
 
-	if (kmsg && kmsg->iov != kmsg->fast_iov)
+	if (kmsg->iov != kmsg->fast_iov)
 		kfree(kmsg->iov);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 
@@ -4212,7 +4212,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 static int io_recv(struct io_kiocb *req, bool force_nonblock,
 		   struct io_comp_state *cs)
 {
-	struct io_buffer *kbuf = NULL;
+	struct io_buffer *kbuf;
 	struct io_sr_msg *sr = &req->sr_msg;
 	struct msghdr msg;
 	void __user *buf = sr->buf;

From 14c32eee9286621dd437b53460e44bd11e5bc08d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jul 2020 23:28:01 +0300
Subject: [PATCH 432/502] io_uring: don't forget cflags in io_recv()

Instead of returning error from io_recv(), go through generic cleanup
path, because it'll retain cflags for userspace. Do the same for
io_send() for consistency.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ac3c16ea7d23..2ffacfbf9094 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3976,7 +3976,7 @@ static int io_send(struct io_kiocb *req, bool force_nonblock,
 
 	ret = import_single_range(WRITE, sr->buf, sr->len, &iov, &msg.msg_iter);
 	if (unlikely(ret))
-		return ret;
+		return ret;;
 
 	msg.msg_name = NULL;
 	msg.msg_control = NULL;
@@ -4232,10 +4232,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
 		buf = u64_to_user_ptr(kbuf->addr);
 
 	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
-	if (unlikely(ret)) {
-		kfree(kbuf);
-		return ret;
-	}
+	if (unlikely(ret))
+		goto out_free;
 
 	req->flags |= REQ_F_NEED_CLEANUP;
 	msg.msg_name = NULL;
@@ -4256,7 +4254,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
 		return -EAGAIN;
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
-
+out_free:
 	kfree(kbuf);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)

From 0e1b6fe3d1e5f1b79c5bec37881c98febfba7718 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jul 2020 23:28:02 +0300
Subject: [PATCH 433/502] io_uring: free selected-bufs if error'ed

io_clean_op() may be skipped even if there is a selected io_buffer;
that's because *select_buffer() functions never set REQ_F_NEED_CLEANUP.

Trigger io_clean_op() when REQ_F_BUFFER_SELECTED is set as well, and
clear the flag if the buffer was freed outside of it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 83 ++++++++++++++++++++++++++-------------------------
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 2ffacfbf9094..4448b1e9a754 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -957,7 +957,7 @@ static void io_get_req_task(struct io_kiocb *req)
 
 static inline void io_clean_op(struct io_kiocb *req)
 {
-	if (req->flags & REQ_F_NEED_CLEANUP)
+	if (req->flags & (REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED))
 		__io_clean_op(req);
 }
 
@@ -1931,6 +1931,7 @@ static int io_put_kbuf(struct io_kiocb *req)
 	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
 	cflags |= IORING_CQE_F_BUFFER;
 	req->rw.addr = 0;
+	req->flags &= ~REQ_F_BUFFER_SELECTED;
 	kfree(kbuf);
 	return cflags;
 }
@@ -4188,20 +4189,16 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 
 	ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg,
 					kmsg->uaddr, flags);
-	if (force_nonblock && ret == -EAGAIN) {
-		ret = io_setup_async_msg(req, kmsg);
-		if (ret != -EAGAIN)
-			kfree(kbuf);
-		return ret;
-	}
+	if (force_nonblock && ret == -EAGAIN)
+		return io_setup_async_msg(req, kmsg);
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
+
 	if (kbuf)
 		kfree(kbuf);
-
 	if (kmsg->iov != kmsg->fast_iov)
 		kfree(kmsg->iov);
-	req->flags &= ~REQ_F_NEED_CLEANUP;
+	req->flags &= ~(REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED);
 
 	if (ret < 0)
 		req_set_fail_links(req);
@@ -4235,7 +4232,6 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
 	if (unlikely(ret))
 		goto out_free;
 
-	req->flags |= REQ_F_NEED_CLEANUP;
 	msg.msg_name = NULL;
 	msg.msg_control = NULL;
 	msg.msg_controllen = 0;
@@ -4255,7 +4251,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 out_free:
-	kfree(kbuf);
+	if (kbuf)
+		kfree(kbuf);
 	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
 		req_set_fail_links(req);
@@ -5436,39 +5433,45 @@ static void __io_clean_op(struct io_kiocb *req)
 {
 	struct io_async_ctx *io = req->io;
 
-	switch (req->opcode) {
-	case IORING_OP_READV:
-	case IORING_OP_READ_FIXED:
-	case IORING_OP_READ:
-		if (req->flags & REQ_F_BUFFER_SELECTED)
+	if (req->flags & REQ_F_BUFFER_SELECTED) {
+		switch (req->opcode) {
+		case IORING_OP_READV:
+		case IORING_OP_READ_FIXED:
+		case IORING_OP_READ:
 			kfree((void *)(unsigned long)req->rw.addr);
-		/* fallthrough */
-	case IORING_OP_WRITEV:
-	case IORING_OP_WRITE_FIXED:
-	case IORING_OP_WRITE:
-		if (io->rw.iov != io->rw.fast_iov)
-			kfree(io->rw.iov);
-		break;
-	case IORING_OP_RECVMSG:
-		if (req->flags & REQ_F_BUFFER_SELECTED)
+			break;
+		case IORING_OP_RECVMSG:
+		case IORING_OP_RECV:
 			kfree(req->sr_msg.kbuf);
-		/* fallthrough */
-	case IORING_OP_SENDMSG:
-		if (io->msg.iov != io->msg.fast_iov)
-			kfree(io->msg.iov);
-		break;
-	case IORING_OP_RECV:
-		if (req->flags & REQ_F_BUFFER_SELECTED)
-			kfree(req->sr_msg.kbuf);
-		break;
-	case IORING_OP_SPLICE:
-	case IORING_OP_TEE:
-		io_put_file(req, req->splice.file_in,
-			    (req->splice.flags & SPLICE_F_FD_IN_FIXED));
-		break;
+			break;
+		}
+		req->flags &= ~REQ_F_BUFFER_SELECTED;
 	}
 
-	req->flags &= ~REQ_F_NEED_CLEANUP;
+	if (req->flags & REQ_F_NEED_CLEANUP) {
+		switch (req->opcode) {
+		case IORING_OP_READV:
+		case IORING_OP_READ_FIXED:
+		case IORING_OP_READ:
+		case IORING_OP_WRITEV:
+		case IORING_OP_WRITE_FIXED:
+		case IORING_OP_WRITE:
+			if (io->rw.iov != io->rw.fast_iov)
+				kfree(io->rw.iov);
+			break;
+		case IORING_OP_RECVMSG:
+		case IORING_OP_SENDMSG:
+			if (io->msg.iov != io->msg.fast_iov)
+				kfree(io->msg.iov);
+			break;
+		case IORING_OP_SPLICE:
+		case IORING_OP_TEE:
+			io_put_file(req, req->splice.file_in,
+				    (req->splice.flags & SPLICE_F_FD_IN_FIXED));
+			break;
+		}
+		req->flags &= ~REQ_F_NEED_CLEANUP;
+	}
 }
 
 static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,

From bc02ef3325e3ef524ef29b65681ca4207b781224 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jul 2020 23:28:03 +0300
Subject: [PATCH 434/502] io_uring: move BUFFER_SELECT check into *recv[msg]

Move the REQ_F_BUFFER_SELECT flag check out of io_recv_buffer_select(),
and do that at its call sites. That saves us from double error checking
and possibly an extra function call.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 4448b1e9a754..8dd9037e332e 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4098,9 +4098,6 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
 	struct io_sr_msg *sr = &req->sr_msg;
 	struct io_buffer *kbuf;
 
-	if (!(req->flags & REQ_F_BUFFER_SELECT))
-		return NULL;
-
 	kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock);
 	if (IS_ERR(kbuf))
 		return kbuf;
@@ -4150,7 +4147,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 {
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
-	struct io_buffer *kbuf;
+	struct io_buffer *kbuf = NULL;
 	unsigned flags;
 	int ret, cflags = 0;
 
@@ -4172,10 +4169,10 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 		kmsg = &iomsg;
 	}
 
-	kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
-	if (IS_ERR(kbuf)) {
-		return PTR_ERR(kbuf);
-	} else if (kbuf) {
+	if (req->flags & REQ_F_BUFFER_SELECT) {
+		kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+		if (IS_ERR(kbuf))
+			return PTR_ERR(kbuf);
 		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
 		iov_iter_init(&kmsg->msg.msg_iter, READ, kmsg->iov,
 				1, req->sr_msg.len);
@@ -4222,11 +4219,12 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
 	if (unlikely(!sock))
 		return ret;
 
-	kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
-	if (IS_ERR(kbuf))
-		return PTR_ERR(kbuf);
-	else if (kbuf)
+	if (req->flags & REQ_F_BUFFER_SELECT) {
+		kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+		if (IS_ERR(kbuf))
+			return PTR_ERR(kbuf);
 		buf = u64_to_user_ptr(kbuf->addr);
+	}
 
 	ret = import_single_range(READ, buf, sr->len, &iov, &msg.msg_iter);
 	if (unlikely(ret))

From 8ff069bf2efd7b7aeb90b56ea8edc165c93d8940 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jul 2020 23:28:04 +0300
Subject: [PATCH 435/502] io_uring: extract io_put_kbuf() helper

Extract a common helper for cleaning up a selected buffer; this will be
used shortly. By the way, correct cflags types to unsigned and, as kbufs
are anyway tracked by a flag, remove the useless zeroing of req->rw.addr.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 8dd9037e332e..871ada2a29c3 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1922,20 +1922,25 @@ static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx)
 	return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head;
 }
 
-static int io_put_kbuf(struct io_kiocb *req)
+static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf)
 {
-	struct io_buffer *kbuf;
-	int cflags;
+	unsigned int cflags;
 
-	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
 	cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
 	cflags |= IORING_CQE_F_BUFFER;
-	req->rw.addr = 0;
 	req->flags &= ~REQ_F_BUFFER_SELECTED;
 	kfree(kbuf);
 	return cflags;
 }
 
+static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req)
+{
+	struct io_buffer *kbuf;
+
+	kbuf = (struct io_buffer *) (unsigned long) req->rw.addr;
+	return io_put_kbuf(req, kbuf);
+}
+
 static inline bool io_run_task_work(void)
 {
 	if (current->task_works) {
@@ -1985,7 +1990,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 		list_del(&req->inflight_entry);
 
 		if (req->flags & REQ_F_BUFFER_SELECTED)
-			cflags = io_put_kbuf(req);
+			cflags = io_put_rw_kbuf(req);
 
 		__io_cqring_fill_event(req, req->result, cflags);
 		(*nr_events)++;
@@ -2177,7 +2182,7 @@ static void io_complete_rw_common(struct kiocb *kiocb, long res,
 	if (res != req->result)
 		req_set_fail_links(req);
 	if (req->flags & REQ_F_BUFFER_SELECTED)
-		cflags = io_put_kbuf(req);
+		cflags = io_put_rw_kbuf(req);
 	__io_req_complete(req, res, cflags, cs);
 }
 

From 7fbb1b541f4286cc337b9bca1e5bad0ce4ee978c Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 16 Jul 2020 23:28:05 +0300
Subject: [PATCH 436/502] io_uring: don't open-code recv kbuf management

Don't implement fast path of kbuf freeing and management inlined into
io_recv{,msg}(), that's error prone and duplicates handling. Replace it
with a helper io_put_recv_kbuf(), which mimics io_put_rw_kbuf() in the
io_read/write().

This also keeps cflags calculation in one place, removing duplication
between rw and recv/send.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 871ada2a29c3..6e5ea7991c08 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4098,7 +4098,7 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req,
 }
 
 static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
-					       int *cflags, bool needs_lock)
+					       bool needs_lock)
 {
 	struct io_sr_msg *sr = &req->sr_msg;
 	struct io_buffer *kbuf;
@@ -4109,12 +4109,14 @@ static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req,
 
 	sr->kbuf = kbuf;
 	req->flags |= REQ_F_BUFFER_SELECTED;
-
-	*cflags = kbuf->bid << IORING_CQE_BUFFER_SHIFT;
-	*cflags |= IORING_CQE_F_BUFFER;
 	return kbuf;
 }
 
+static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req)
+{
+	return io_put_kbuf(req, req->sr_msg.kbuf);
+}
+
 static int io_recvmsg_prep(struct io_kiocb *req,
 			   const struct io_uring_sqe *sqe)
 {
@@ -4152,7 +4154,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 {
 	struct io_async_msghdr iomsg, *kmsg;
 	struct socket *sock;
-	struct io_buffer *kbuf = NULL;
+	struct io_buffer *kbuf;
 	unsigned flags;
 	int ret, cflags = 0;
 
@@ -4175,7 +4177,7 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 	}
 
 	if (req->flags & REQ_F_BUFFER_SELECT) {
-		kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+		kbuf = io_recv_buffer_select(req, !force_nonblock);
 		if (IS_ERR(kbuf))
 			return PTR_ERR(kbuf);
 		kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr);
@@ -4196,12 +4198,11 @@ static int io_recvmsg(struct io_kiocb *req, bool force_nonblock,
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 
-	if (kbuf)
-		kfree(kbuf);
+	if (req->flags & REQ_F_BUFFER_SELECTED)
+		cflags = io_put_recv_kbuf(req);
 	if (kmsg->iov != kmsg->fast_iov)
 		kfree(kmsg->iov);
-	req->flags &= ~(REQ_F_NEED_CLEANUP | REQ_F_BUFFER_SELECTED);
-
+	req->flags &= ~REQ_F_NEED_CLEANUP;
 	if (ret < 0)
 		req_set_fail_links(req);
 	__io_req_complete(req, ret, cflags, cs);
@@ -4225,7 +4226,7 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
 		return ret;
 
 	if (req->flags & REQ_F_BUFFER_SELECT) {
-		kbuf = io_recv_buffer_select(req, &cflags, !force_nonblock);
+		kbuf = io_recv_buffer_select(req, !force_nonblock);
 		if (IS_ERR(kbuf))
 			return PTR_ERR(kbuf);
 		buf = u64_to_user_ptr(kbuf->addr);
@@ -4254,9 +4255,8 @@ static int io_recv(struct io_kiocb *req, bool force_nonblock,
 	if (ret == -ERESTARTSYS)
 		ret = -EINTR;
 out_free:
-	if (kbuf)
-		kfree(kbuf);
-	req->flags &= ~REQ_F_NEED_CLEANUP;
+	if (req->flags & REQ_F_BUFFER_SELECTED)
+		cflags = io_put_recv_kbuf(req);
 	if (ret < 0)
 		req_set_fail_links(req);
 	__io_req_complete(req, ret, cflags, cs);

From 5dbcad51f78434e782d0470b8b5fc4380700c35f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 18 Jul 2020 11:31:20 +0300
Subject: [PATCH 437/502] io_uring: don't miscount pinned memory

io_sqe_buffer_unregister() uses ctx->sqo_mm for memory accounting, but
io_ring_ctx_free() drops ->sqo_mm before calling it, leaving pinned_vm
over-accounted. Postpone mm cleanup until it's not needed anymore.

Fixes: 309758254ea62 ("io_uring: report pinned memory usage")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6e5ea7991c08..ba7ce103667b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7670,12 +7670,12 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx)
 static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 {
 	io_finish_async(ctx);
+	io_sqe_buffer_unregister(ctx);
 	if (ctx->sqo_mm) {
 		mmdrop(ctx->sqo_mm);
 		ctx->sqo_mm = NULL;
 	}
 
-	io_sqe_buffer_unregister(ctx);
 	io_sqe_files_unregister(ctx);
 	io_eventfd_unregister(ctx);
 	io_destroy_buffers(ctx);

From cbcf72148da4af55ea81cfb351ea7c026ff1014f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 18 Jul 2020 11:31:21 +0300
Subject: [PATCH 438/502] io_uring: return locked and pinned page accounting

Locked and pinned memory accounting in io_{,un}account_mem() depends on
having ->sqo_mm, which is NULL after a recent change for non SQPOLL'ed
io_ring. That disables the accounting.

Return ->sqo_mm initialisation back, and do __io_sq_thread_acquire_mm()
based on IORING_SETUP_SQPOLL flag.

Fixes: 8eb06d7e8dd85 ("io_uring: fix missing ->mm on exit")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index ba7ce103667b..680b16f71a03 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -981,7 +981,8 @@ static void io_sq_thread_drop_mm(void)
 static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx)
 {
 	if (!current->mm) {
-		if (unlikely(!ctx->sqo_mm || !mmget_not_zero(ctx->sqo_mm)))
+		if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) ||
+			     !mmget_not_zero(ctx->sqo_mm)))
 			return -EFAULT;
 		kthread_use_mm(ctx->sqo_mm);
 	}
@@ -7259,10 +7260,10 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
 {
 	int ret;
 
-	if (ctx->flags & IORING_SETUP_SQPOLL) {
-		mmgrab(current->mm);
-		ctx->sqo_mm = current->mm;
+	mmgrab(current->mm);
+	ctx->sqo_mm = current->mm;
 
+	if (ctx->flags & IORING_SETUP_SQPOLL) {
 		ret = -EPERM;
 		if (!capable(CAP_SYS_ADMIN))
 			goto err;

From dd6f843a9fca8f225c86fee5f50da429c369c045 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 18 Jul 2020 11:32:51 +0300
Subject: [PATCH 439/502] tasks: add put_task_struct_many()

put_task_struct_many() is like put_task_struct() but puts several
references at once. Useful for batching puts.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/sched/task.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 38359071236a..1301077f9c24 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -126,6 +126,12 @@ static inline void put_task_struct(struct task_struct *t)
 		__put_task_struct(t);
 }
 
+static inline void put_task_struct_many(struct task_struct *t, int nr)
+{
+	if (refcount_sub_and_test(nr, &t->usage))
+		__put_task_struct(t);
+}
+
 void put_task_struct_rcu_user(struct task_struct *task);
 
 #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT

From 5af1d13e8f0d8839db04a71ec786f369b0e67234 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 18 Jul 2020 11:32:52 +0300
Subject: [PATCH 440/502] io_uring: batch put_task_struct()

As every iopoll request has a task ref, it becomes expensive to put
them one by one. Instead, we can put several at once by integrating that
into io_req_free_batch().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 680b16f71a03..3a415d924b93 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1544,7 +1544,6 @@ static void io_dismantle_req(struct io_kiocb *req)
 		kfree(req->io);
 	if (req->file)
 		io_put_file(req, req->file, (req->flags & REQ_F_FIXED_FILE));
-	__io_put_req_task(req);
 	io_req_clean_work(req);
 
 	if (req->flags & REQ_F_INFLIGHT) {
@@ -1564,6 +1563,7 @@ static void __io_free_req(struct io_kiocb *req)
 	struct io_ring_ctx *ctx;
 
 	io_dismantle_req(req);
+	__io_put_req_task(req);
 	ctx = req->ctx;
 	if (likely(!io_is_fallback_req(req)))
 		kmem_cache_free(req_cachep, req);
@@ -1807,8 +1807,18 @@ static void io_free_req(struct io_kiocb *req)
 struct req_batch {
 	void *reqs[IO_IOPOLL_BATCH];
 	int to_free;
+
+	struct task_struct	*task;
+	int			task_refs;
 };
 
+static inline void io_init_req_batch(struct req_batch *rb)
+{
+	rb->to_free = 0;
+	rb->task_refs = 0;
+	rb->task = NULL;
+}
+
 static void __io_req_free_batch_flush(struct io_ring_ctx *ctx,
 				      struct req_batch *rb)
 {
@@ -1822,6 +1832,10 @@ static void io_req_free_batch_finish(struct io_ring_ctx *ctx,
 {
 	if (rb->to_free)
 		__io_req_free_batch_flush(ctx, rb);
+	if (rb->task) {
+		put_task_struct_many(rb->task, rb->task_refs);
+		rb->task = NULL;
+	}
 }
 
 static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
@@ -1833,6 +1847,17 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req)
 	if (req->flags & REQ_F_LINK_HEAD)
 		io_queue_next(req);
 
+	if (req->flags & REQ_F_TASK_PINNED) {
+		if (req->task != rb->task) {
+			if (rb->task)
+				put_task_struct_many(rb->task, rb->task_refs);
+			rb->task = req->task;
+			rb->task_refs = 0;
+		}
+		rb->task_refs++;
+		req->flags &= ~REQ_F_TASK_PINNED;
+	}
+
 	io_dismantle_req(req);
 	rb->reqs[rb->to_free++] = req;
 	if (unlikely(rb->to_free == ARRAY_SIZE(rb->reqs)))
@@ -1978,7 +2003,7 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
 	/* order with ->result store in io_complete_rw_iopoll() */
 	smp_rmb();
 
-	rb.to_free = 0;
+	io_init_req_batch(&rb);
 	while (!list_empty(done)) {
 		int cflags = 0;
 

From 23b3628e45924419399da48c2b3a522b05557c91 Mon Sep 17 00:00:00 2001
From: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Date: Thu, 23 Jul 2020 20:57:24 +0800
Subject: [PATCH 441/502] io_uring: clear IORING_SQ_NEED_WAKEUP after executing
 task works

In io_sq_thread(), if there are task works to handle, the current code
skips schedule() and goes on polling the sq again, but forgets to clear
the IORING_SQ_NEED_WAKEUP flag. Fix this issue. Also add two helpers to
set and clear the IORING_SQ_NEED_WAKEUP flag.

Signed-off-by: Xiaoguang Wang <xiaoguang.wang@linux.alibaba.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3a415d924b93..6f3f18a99f4f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -6344,6 +6344,21 @@ fail_req:
 	return submitted;
 }
 
+static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx)
+{
+	/* Tell userspace we may need a wakeup call */
+	spin_lock_irq(&ctx->completion_lock);
+	ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
+static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx)
+{
+	spin_lock_irq(&ctx->completion_lock);
+	ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
+	spin_unlock_irq(&ctx->completion_lock);
+}
+
 static int io_sq_thread(void *data)
 {
 	struct io_ring_ctx *ctx = data;
@@ -6417,10 +6432,7 @@ static int io_sq_thread(void *data)
 				continue;
 			}
 
-			/* Tell userspace we may need a wakeup call */
-			spin_lock_irq(&ctx->completion_lock);
-			ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
-			spin_unlock_irq(&ctx->completion_lock);
+			io_ring_set_wakeup_flag(ctx);
 
 			to_submit = io_sqring_entries(ctx);
 			if (!to_submit || ret == -EBUSY) {
@@ -6430,6 +6442,7 @@ static int io_sq_thread(void *data)
 				}
 				if (io_run_task_work()) {
 					finish_wait(&ctx->sqo_wait, &wait);
+					io_ring_clear_wakeup_flag(ctx);
 					continue;
 				}
 				if (signal_pending(current))
@@ -6437,17 +6450,13 @@ static int io_sq_thread(void *data)
 				schedule();
 				finish_wait(&ctx->sqo_wait, &wait);
 
-				spin_lock_irq(&ctx->completion_lock);
-				ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
-				spin_unlock_irq(&ctx->completion_lock);
+				io_ring_clear_wakeup_flag(ctx);
 				ret = 0;
 				continue;
 			}
 			finish_wait(&ctx->sqo_wait, &wait);
 
-			spin_lock_irq(&ctx->completion_lock);
-			ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP;
-			spin_unlock_irq(&ctx->completion_lock);
+			io_ring_clear_wakeup_flag(ctx);
 		}
 
 		mutex_lock(&ctx->uring_lock);

From ae34817bd93e373a03203a4c6892735c430a14e1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 23 Jul 2020 20:25:20 +0300
Subject: [PATCH 442/502] io_uring: don't do opcode prep twice

Calling into opcode prep handlers may be dangerous, as they re-read
SQE but might not re-initialise requests completely. If io_req_defer()
passed fast checks and is done with preparations, punt it async.

As all other cases are covered with nulling @sqe, this guarantees that
io_[opcode]_prep() are visited only once per request.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6f3f18a99f4f..38e4c3902963 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5447,7 +5447,8 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) {
 		spin_unlock_irq(&ctx->completion_lock);
 		kfree(de);
-		return 0;
+		io_queue_async_work(req);
+		return -EIOCBQUEUED;
 	}
 
 	trace_io_uring_defer(ctx, req, req->user_data);

From f56040b81999871973d21f334b4657957422c90e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 23 Jul 2020 20:25:21 +0300
Subject: [PATCH 443/502] io_uring: deduplicate io_grab_files() calls

Move io_req_init_async() into io_grab_files(), it's safer this way. Note
that io_queue_async_work() does *init_async(), so it's valid to move out
of __io_queue_sqe() punt path. Also, add a helper around io_grab_files().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 38e4c3902963..c7e8e9a1b27b 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -912,7 +912,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req);
 static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				 struct io_uring_files_update *ip,
 				 unsigned nr_args);
-static int io_grab_files(struct io_kiocb *req);
+static int io_prep_work_files(struct io_kiocb *req);
 static void io_complete_rw_common(struct kiocb *kiocb, long res,
 				  struct io_comp_state *cs);
 static void __io_clean_op(struct io_kiocb *req);
@@ -5294,13 +5294,9 @@ static int io_req_defer_prep(struct io_kiocb *req,
 
 	if (io_alloc_async_ctx(req))
 		return -EAGAIN;
-
-	if (io_op_defs[req->opcode].file_table) {
-		io_req_init_async(req);
-		ret = io_grab_files(req);
-		if (unlikely(ret))
-			return ret;
-	}
+	ret = io_prep_work_files(req);
+	if (unlikely(ret))
+		return ret;
 
 	switch (req->opcode) {
 	case IORING_OP_NOP:
@@ -5851,6 +5847,8 @@ static int io_grab_files(struct io_kiocb *req)
 	int ret = -EBADF;
 	struct io_ring_ctx *ctx = req->ctx;
 
+	io_req_init_async(req);
+
 	if (req->work.files || (req->flags & REQ_F_NO_FILE_TABLE))
 		return 0;
 	if (!ctx->ring_file)
@@ -5876,6 +5874,13 @@ static int io_grab_files(struct io_kiocb *req)
 	return ret;
 }
 
+static inline int io_prep_work_files(struct io_kiocb *req)
+{
+	if (!io_op_defs[req->opcode].file_table)
+		return 0;
+	return io_grab_files(req);
+}
+
 static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 {
 	struct io_timeout_data *data = container_of(timer,
@@ -5987,14 +5992,9 @@ again:
 			goto exit;
 		}
 punt:
-		io_req_init_async(req);
-
-		if (io_op_defs[req->opcode].file_table) {
-			ret = io_grab_files(req);
-			if (ret)
-				goto err;
-		}
-
+		ret = io_prep_work_files(req);
+		if (unlikely(ret))
+			goto err;
 		/*
 		 * Queued up for async execution, worker will release
 		 * submit reference when the iocb is actually submitted.

From b65e0dd6a2de050d3fc4c0db4969a245f4e7273e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 25 Jul 2020 14:41:58 +0300
Subject: [PATCH 444/502] io_uring: mark ->work uninitialised after cleanup

Remove REQ_F_WORK_INITIALIZED after io_req_clean_work(). That's a cold
path but is safer for those using io_req_clean_work() outside of
*dismantle_req()/*io_free(). For the same reason, zero work.fs.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index c7e8e9a1b27b..59f1f473ffc7 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1141,7 +1141,9 @@ static void io_req_clean_work(struct io_kiocb *req)
 		spin_unlock(&req->work.fs->lock);
 		if (fs)
 			free_fs_struct(fs);
+		req->work.fs = NULL;
 	}
+	req->flags &= ~REQ_F_WORK_INITIALIZED;
 }
 
 static void io_prep_async_work(struct io_kiocb *req)
@@ -4969,7 +4971,6 @@ static int io_poll_add(struct io_kiocb *req)
 
 	/* ->work is in union with hash_node and others */
 	io_req_clean_work(req);
-	req->flags &= ~REQ_F_WORK_INITIALIZED;
 
 	INIT_HLIST_NODE(&req->hash_node);
 	ipt.pt._qproc = io_poll_queue_proc;

From f063c5477eb392c315aa25ad538b4920b367ea05 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 25 Jul 2020 14:41:59 +0300
Subject: [PATCH 445/502] io_uring: fix missing io_queue_linked_timeout()

Whoever called io_prep_linked_timeout() should also do
io_queue_linked_timeout(). __io_queue_sqe() doesn't follow that for the
punting path leaving linked timeouts prepared but never queued.

Fixes: 6df1db6b54243 ("io_uring: fix mis-refcounting linked timeouts")
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 59f1f473ffc7..3e406bc1f855 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -5987,20 +5987,20 @@ again:
 	 * doesn't support non-blocking read/write attempts
 	 */
 	if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) {
-		if (io_arm_poll_handler(req)) {
-			if (linked_timeout)
-				io_queue_linked_timeout(linked_timeout);
-			goto exit;
-		}
+		if (!io_arm_poll_handler(req)) {
 punt:
-		ret = io_prep_work_files(req);
-		if (unlikely(ret))
-			goto err;
-		/*
-		 * Queued up for async execution, worker will release
-		 * submit reference when the iocb is actually submitted.
-		 */
-		io_queue_async_work(req);
+			ret = io_prep_work_files(req);
+			if (unlikely(ret))
+				goto err;
+			/*
+			 * Queued up for async execution, worker will release
+			 * submit reference when the iocb is actually submitted.
+			 */
+			io_queue_async_work(req);
+		}
+
+		if (linked_timeout)
+			io_queue_linked_timeout(linked_timeout);
 		goto exit;
 	}
 

From b089ed390b5c9bc248a32168709cfa01099caf9d Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 25 Jul 2020 14:42:00 +0300
Subject: [PATCH 446/502] io-wq: update hash bits

Linked requests are hashed, remove a comment stating otherwise. Also
move hash bits to emphasise that we don't carry it through loop
iteration and set it every time.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io-wq.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index 8702d3c3b291..e92c4724480c 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -490,7 +490,6 @@ static void io_worker_handle_work(struct io_worker *worker)
 
 	do {
 		struct io_wq_work *work;
-		unsigned int hash;
 get_next:
 		/*
 		 * If we got some work, mark us as busy. If we didn't, but
@@ -513,6 +512,7 @@ get_next:
 		/* handle a whole dependent link */
 		do {
 			struct io_wq_work *old_work, *next_hashed, *linked;
+			unsigned int hash = io_get_work_hash(work);
 
 			next_hashed = wq_next_work(work);
 			io_impersonate_work(worker, work);
@@ -523,7 +523,6 @@ get_next:
 			if (test_bit(IO_WQ_BIT_CANCEL, &wq->state))
 				work->flags |= IO_WQ_WORK_CANCEL;
 
-			hash = io_get_work_hash(work);
 			old_work = work;
 			linked = wq->do_work(work);
 
@@ -542,8 +541,6 @@ get_next:
 				spin_lock_irq(&wqe->lock);
 				wqe->hash_map &= ~BIT_ULL(hash);
 				wqe->flags &= ~IO_WQE_FLAG_STALLED;
-				/* dependent work is not hashed */
-				hash = -1U;
 				/* skip unnecessary unlock-lock wqe->lock */
 				if (!work)
 					goto get_next;

From 4631f3ca493a7c8f9f31aef45fc0fc0e182155b7 Mon Sep 17 00:00:00 2001
From: Niklas Schnelle <schnelle@linux.ibm.com>
Date: Tue, 7 Jul 2020 16:42:19 +0200
Subject: [PATCH 447/502] s390/pci: clarify comment in s390_mmio_read/write

The existing comment was talking about reading in the write part
and vice versa. While we are here make it more clear why restricting
the syscalls to MIO capable devices is okay.

Signed-off-by: Niklas Schnelle <schnelle@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/pci/pci_mmio.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/s390/pci/pci_mmio.c b/arch/s390/pci/pci_mmio.c
index 38efa3e852c4..401cf670a243 100644
--- a/arch/s390/pci/pci_mmio.c
+++ b/arch/s390/pci/pci_mmio.c
@@ -155,10 +155,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_write, unsigned long, mmio_addr,
 		return -EINVAL;
 
 	/*
-	 * Only support read access to MIO capable devices on a MIO enabled
-	 * system. Otherwise we would have to check for every address if it is
-	 * a special ZPCI_ADDR and we would have to do a get_pfn() which we
-	 * don't need for MIO capable devices.
+	 * We only support write access to MIO capable devices if we are on
+	 * a MIO enabled system. Otherwise we would have to check for every
+	 * address if it is a special ZPCI_ADDR and would have to do
+	 * a get_pfn() which we don't need for MIO capable devices.  Currently
+	 * ISM devices are the only devices without MIO support and there is no
+	 * known need for accessing these from userspace.
 	 */
 	if (static_branch_likely(&have_mio)) {
 		ret = __memcpy_toio_inuser((void  __iomem *) mmio_addr,
@@ -282,10 +284,12 @@ SYSCALL_DEFINE3(s390_pci_mmio_read, unsigned long, mmio_addr,
 		return -EINVAL;
 
 	/*
-	 * Only support write access to MIO capable devices on a MIO enabled
-	 * system. Otherwise we would have to check for every address if it is
-	 * a special ZPCI_ADDR and we would have to do a get_pfn() which we
-	 * don't need for MIO capable devices.
+	 * We only support read access to MIO capable devices if we are on
+	 * a MIO enabled system. Otherwise we would have to check for every
+	 * address if it is a special ZPCI_ADDR and would have to do
+	 * a get_pfn() which we don't need for MIO capable devices.  Currently
+	 * ISM devices are the only devices without MIO support and there is no
+	 * known need for accessing these from userspace.
 	 */
 	if (static_branch_likely(&have_mio)) {
 		ret = __memcpy_fromio_inuser(

From 73d6eb48d26930f0cbdc8bf1ccb0ad964e7d2b90 Mon Sep 17 00:00:00 2001
From: Ilya Leoshkevich <iii@linux.ibm.com>
Date: Wed, 22 Jul 2020 23:58:54 +0200
Subject: [PATCH 448/502] s390: enable HAVE_FUNCTION_ERROR_INJECTION

This kernel feature is required for enabling BPF_KPROBE_OVERRIDE.

Define override_function_with_return() and regs_set_return_value()
functions, and fix compile errors in syscall_wrapper.h.

Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/Kconfig                       |  1 +
 arch/s390/include/asm/ptrace.h          |  5 +++++
 arch/s390/include/asm/syscall_wrapper.h |  6 +++---
 arch/s390/lib/Makefile                  |  2 ++
 arch/s390/lib/error-inject.c            | 14 ++++++++++++++
 5 files changed, 25 insertions(+), 3 deletions(-)
 create mode 100644 arch/s390/lib/error-inject.c

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index d95d323cf213..9cfd8de907cb 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -145,6 +145,7 @@ config S390
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 	select HAVE_FENTRY
 	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUTEX_CMPXCHG if FUTEX
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index f009a13afe71..16b3e4396312 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -184,5 +184,10 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
 	return regs->gprs[15];
 }
 
+static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc)
+{
+	regs->gprs[2] = rc;
+}
+
 #endif /* __ASSEMBLY__ */
 #endif /* _S390_PTRACE_H */
diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h
index 3c3d6fe8e2f0..1320f4213d80 100644
--- a/arch/s390/include/asm/syscall_wrapper.h
+++ b/arch/s390/include/asm/syscall_wrapper.h
@@ -30,7 +30,7 @@
 })
 
 #define __S390_SYS_STUBx(x, name, ...)					\
-	asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\
+	asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));\
 	ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO);			\
 	asmlinkage long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))\
 	{								\
@@ -46,7 +46,7 @@
 #define COMPAT_SYSCALL_DEFINE0(sname)					\
 	SYSCALL_METADATA(_##sname, 0);					\
 	asmlinkage long __s390_compat_sys_##sname(void);		\
-	ALLOW_ERROR_INJECTION(__s390_compat__sys_##sname, ERRNO);	\
+	ALLOW_ERROR_INJECTION(__s390_compat_sys_##sname, ERRNO);	\
 	asmlinkage long __s390_compat_sys_##sname(void)
 
 #define SYSCALL_DEFINE0(sname)						\
@@ -72,7 +72,7 @@
 	asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
 	asmlinkage long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
 		__attribute__((alias(__stringify(__se_compat_sys##name))));	\
-	ALLOW_ERROR_INJECTION(compat_sys##name, ERRNO);				\
+	ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO);			\
 	static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
 	asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
 	asmlinkage long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 28fd66d558ff..678333936f78 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -14,3 +14,5 @@ KASAN_SANITIZE_uaccess.o := n
 
 obj-$(CONFIG_S390_UNWIND_SELFTEST) += test_unwind.o
 CFLAGS_test_unwind.o += -fno-optimize-sibling-calls
+
+lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/s390/lib/error-inject.c b/arch/s390/lib/error-inject.c
new file mode 100644
index 000000000000..8c9d4da87eef
--- /dev/null
+++ b/arch/s390/lib/error-inject.c
@@ -0,0 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0+
+#include <asm/ptrace.h>
+#include <linux/error-injection.h>
+#include <linux/kprobes.h>
+
+void override_function_with_return(struct pt_regs *regs)
+{
+	/*
+	 * Emulate 'br 14'. 'regs' is captured by kprobes on entry to some
+	 * kernel function.
+	 */
+	regs->psw.addr = regs->gprs[14];
+}
+NOKPROBE_SYMBOL(override_function_with_return);

From 8398b226b8f01df902450658a139ee01d9f4c482 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 22 Jul 2020 11:45:50 +0200
Subject: [PATCH 449/502] s390/vmem: rename vmem_add_mem() to vmem_add_range()

Let's match the name to vmem_remove_range().

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200722094558.9828-2-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/vmem.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 3b9e71654c37..66c5333020ea 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -57,7 +57,7 @@ pte_t __ref *vmem_pte_alloc(void)
 /*
  * Add a physical memory range to the 1:1 mapping.
  */
-static int vmem_add_mem(unsigned long start, unsigned long size)
+static int vmem_add_range(unsigned long start, unsigned long size)
 {
 	unsigned long pgt_prot, sgt_prot, r3_prot;
 	unsigned long pages4k, pages1m, pages2g;
@@ -308,7 +308,7 @@ int vmem_add_mapping(unsigned long start, unsigned long size)
 		return -ERANGE;
 
 	mutex_lock(&vmem_mutex);
-	ret = vmem_add_mem(start, size);
+	ret = vmem_add_range(start, size);
 	if (ret)
 		vmem_remove_range(start, size);
 	mutex_unlock(&vmem_mutex);
@@ -325,7 +325,7 @@ void __init vmem_map_init(void)
 	struct memblock_region *reg;
 
 	for_each_memblock(memory, reg)
-		vmem_add_mem(reg->base, reg->size);
+		vmem_add_range(reg->base, reg->size);
 	__set_memory((unsigned long)_stext,
 		     (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
 		     SET_MEMORY_RO | SET_MEMORY_X);

From 3e0d3e408e63839625b210e5eb7269c45b870a38 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 22 Jul 2020 11:45:51 +0200
Subject: [PATCH 450/502] s390/vmem: consolidate vmem_add_range() and
 vmem_remove_range()

We want to have only a single pagetable walker and reuse the same
functionality for vmemmap handling. Let's start by consolidating
vmem_add_range() and vmem_remove_range(), converting it into a
recursive implementation.

A recursive implementation makes it easier to expand individual cases
without harming readability. In addition, we minimize traversing the
whole hierarchy over and over again.

One change is that we no longer unmap large PMDs/PUDs that are not
completely covered by the request. This should never happen with direct
mappings unless memory is removed at a different granularity than it was
added, which would already be broken.

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200722094558.9828-3-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/vmem.c | 337 +++++++++++++++++++++++++++-----------------
 1 file changed, 208 insertions(+), 129 deletions(-)

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 66c5333020ea..177daf389d39 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -54,88 +54,218 @@ pte_t __ref *vmem_pte_alloc(void)
 	return pte;
 }
 
+static void modify_pte_table(pmd_t *pmd, unsigned long addr, unsigned long end,
+			    bool add)
+{
+	unsigned long prot, pages = 0;
+	pte_t *pte;
+
+	prot = pgprot_val(PAGE_KERNEL);
+	if (!MACHINE_HAS_NX)
+		prot &= ~_PAGE_NOEXEC;
+
+	pte = pte_offset_kernel(pmd, addr);
+	for (; addr < end; addr += PAGE_SIZE, pte++) {
+		if (!add) {
+			if (pte_none(*pte))
+				continue;
+			pte_clear(&init_mm, addr, pte);
+		} else if (pte_none(*pte)) {
+			pte_val(*pte) = addr | prot;
+		} else
+			continue;
+
+		pages++;
+	}
+
+	update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
+}
+
+static int modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end,
+			    bool add)
+{
+	unsigned long next, prot, pages = 0;
+	int ret = -ENOMEM;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	prot = pgprot_val(SEGMENT_KERNEL);
+	if (!MACHINE_HAS_NX)
+		prot &= ~_SEGMENT_ENTRY_NOEXEC;
+
+	pmd = pmd_offset(pud, addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+
+		if (!add) {
+			if (pmd_none(*pmd))
+				continue;
+			if (pmd_large(*pmd) && !add) {
+				if (IS_ALIGNED(addr, PMD_SIZE) &&
+				    IS_ALIGNED(next, PMD_SIZE)) {
+					pmd_clear(pmd);
+					pages++;
+				}
+				continue;
+			}
+		} else if (pmd_none(*pmd)) {
+			if (IS_ALIGNED(addr, PMD_SIZE) &&
+			    IS_ALIGNED(next, PMD_SIZE) &&
+			    MACHINE_HAS_EDAT1 && addr &&
+			    !debug_pagealloc_enabled()) {
+				pmd_val(*pmd) = addr | prot;
+				pages++;
+				continue;
+			}
+			pte = vmem_pte_alloc();
+			if (!pte)
+				goto out;
+			pmd_populate(&init_mm, pmd, pte);
+		} else if (pmd_large(*pmd))
+			continue;
+
+		modify_pte_table(pmd, addr, next, add);
+	}
+	ret = 0;
+out:
+	update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
+	return ret;
+}
+
+static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
+			    bool add)
+{
+	unsigned long next, prot, pages = 0;
+	int ret = -ENOMEM;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	prot = pgprot_val(REGION3_KERNEL);
+	if (!MACHINE_HAS_NX)
+		prot &= ~_REGION_ENTRY_NOEXEC;
+
+	pud = pud_offset(p4d, addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+
+		if (!add) {
+			if (pud_none(*pud))
+				continue;
+			if (pud_large(*pud)) {
+				if (IS_ALIGNED(addr, PUD_SIZE) &&
+				    IS_ALIGNED(next, PUD_SIZE)) {
+					pud_clear(pud);
+					pages++;
+				}
+				continue;
+			}
+		} else if (pud_none(*pud)) {
+			if (IS_ALIGNED(addr, PUD_SIZE) &&
+			    IS_ALIGNED(next, PUD_SIZE) &&
+			    MACHINE_HAS_EDAT2 && addr &&
+			    !debug_pagealloc_enabled()) {
+				pud_val(*pud) = addr | prot;
+				pages++;
+				continue;
+			}
+			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+			if (!pmd)
+				goto out;
+			pud_populate(&init_mm, pud, pmd);
+		} else if (pud_large(*pud))
+			continue;
+
+		ret = modify_pmd_table(pud, addr, next, add);
+		if (ret)
+			goto out;
+	}
+	ret = 0;
+out:
+	update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
+	return ret;
+}
+
+static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
+			    bool add)
+{
+	unsigned long next;
+	int ret = -ENOMEM;
+	p4d_t *p4d;
+	pud_t *pud;
+
+	p4d = p4d_offset(pgd, addr);
+	for (; addr < end; addr = next, p4d++) {
+		next = p4d_addr_end(addr, end);
+
+		if (!add) {
+			if (p4d_none(*p4d))
+				continue;
+		} else if (p4d_none(*p4d)) {
+			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+			if (!pud)
+				goto out;
+		}
+
+		ret = modify_pud_table(p4d, addr, next, add);
+		if (ret)
+			goto out;
+	}
+	ret = 0;
+out:
+	return ret;
+}
+
+static int modify_pagetable(unsigned long start, unsigned long end, bool add)
+{
+	unsigned long addr, next;
+	int ret = -ENOMEM;
+	pgd_t *pgd;
+	p4d_t *p4d;
+
+	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
+		return -EINVAL;
+
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+		pgd = pgd_offset_k(addr);
+
+		if (!add) {
+			if (pgd_none(*pgd))
+				continue;
+		} else if (pgd_none(*pgd)) {
+			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+			if (!p4d)
+				goto out;
+			pgd_populate(&init_mm, pgd, p4d);
+		}
+
+		ret = modify_p4d_table(pgd, addr, next, add);
+		if (ret)
+			goto out;
+	}
+	ret = 0;
+out:
+	if (!add)
+		flush_tlb_kernel_range(start, end);
+	return ret;
+}
+
+static int add_pagetable(unsigned long start, unsigned long end)
+{
+	return modify_pagetable(start, end, true);
+}
+
+static int remove_pagetable(unsigned long start, unsigned long end)
+{
+	return modify_pagetable(start, end, false);
+}
+
 /*
  * Add a physical memory range to the 1:1 mapping.
  */
 static int vmem_add_range(unsigned long start, unsigned long size)
 {
-	unsigned long pgt_prot, sgt_prot, r3_prot;
-	unsigned long pages4k, pages1m, pages2g;
-	unsigned long end = start + size;
-	unsigned long address = start;
-	pgd_t *pg_dir;
-	p4d_t *p4_dir;
-	pud_t *pu_dir;
-	pmd_t *pm_dir;
-	pte_t *pt_dir;
-	int ret = -ENOMEM;
-
-	pgt_prot = pgprot_val(PAGE_KERNEL);
-	sgt_prot = pgprot_val(SEGMENT_KERNEL);
-	r3_prot = pgprot_val(REGION3_KERNEL);
-	if (!MACHINE_HAS_NX) {
-		pgt_prot &= ~_PAGE_NOEXEC;
-		sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
-		r3_prot &= ~_REGION_ENTRY_NOEXEC;
-	}
-	pages4k = pages1m = pages2g = 0;
-	while (address < end) {
-		pg_dir = pgd_offset_k(address);
-		if (pgd_none(*pg_dir)) {
-			p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
-			if (!p4_dir)
-				goto out;
-			pgd_populate(&init_mm, pg_dir, p4_dir);
-		}
-		p4_dir = p4d_offset(pg_dir, address);
-		if (p4d_none(*p4_dir)) {
-			pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
-			if (!pu_dir)
-				goto out;
-			p4d_populate(&init_mm, p4_dir, pu_dir);
-		}
-		pu_dir = pud_offset(p4_dir, address);
-		if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
-		    !(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
-		     !debug_pagealloc_enabled()) {
-			pud_val(*pu_dir) = address | r3_prot;
-			address += PUD_SIZE;
-			pages2g++;
-			continue;
-		}
-		if (pud_none(*pu_dir)) {
-			pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
-			if (!pm_dir)
-				goto out;
-			pud_populate(&init_mm, pu_dir, pm_dir);
-		}
-		pm_dir = pmd_offset(pu_dir, address);
-		if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
-		    !(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
-		    !debug_pagealloc_enabled()) {
-			pmd_val(*pm_dir) = address | sgt_prot;
-			address += PMD_SIZE;
-			pages1m++;
-			continue;
-		}
-		if (pmd_none(*pm_dir)) {
-			pt_dir = vmem_pte_alloc();
-			if (!pt_dir)
-				goto out;
-			pmd_populate(&init_mm, pm_dir, pt_dir);
-		}
-
-		pt_dir = pte_offset_kernel(pm_dir, address);
-		pte_val(*pt_dir) = address | pgt_prot;
-		address += PAGE_SIZE;
-		pages4k++;
-	}
-	ret = 0;
-out:
-	update_page_count(PG_DIRECT_MAP_4K, pages4k);
-	update_page_count(PG_DIRECT_MAP_1M, pages1m);
-	update_page_count(PG_DIRECT_MAP_2G, pages2g);
-	return ret;
+	return add_pagetable(start, start + size);
 }
 
 /*
@@ -144,58 +274,7 @@ out:
  */
 static void vmem_remove_range(unsigned long start, unsigned long size)
 {
-	unsigned long pages4k, pages1m, pages2g;
-	unsigned long end = start + size;
-	unsigned long address = start;
-	pgd_t *pg_dir;
-	p4d_t *p4_dir;
-	pud_t *pu_dir;
-	pmd_t *pm_dir;
-	pte_t *pt_dir;
-
-	pages4k = pages1m = pages2g = 0;
-	while (address < end) {
-		pg_dir = pgd_offset_k(address);
-		if (pgd_none(*pg_dir)) {
-			address += PGDIR_SIZE;
-			continue;
-		}
-		p4_dir = p4d_offset(pg_dir, address);
-		if (p4d_none(*p4_dir)) {
-			address += P4D_SIZE;
-			continue;
-		}
-		pu_dir = pud_offset(p4_dir, address);
-		if (pud_none(*pu_dir)) {
-			address += PUD_SIZE;
-			continue;
-		}
-		if (pud_large(*pu_dir)) {
-			pud_clear(pu_dir);
-			address += PUD_SIZE;
-			pages2g++;
-			continue;
-		}
-		pm_dir = pmd_offset(pu_dir, address);
-		if (pmd_none(*pm_dir)) {
-			address += PMD_SIZE;
-			continue;
-		}
-		if (pmd_large(*pm_dir)) {
-			pmd_clear(pm_dir);
-			address += PMD_SIZE;
-			pages1m++;
-			continue;
-		}
-		pt_dir = pte_offset_kernel(pm_dir, address);
-		pte_clear(&init_mm, address, pt_dir);
-		address += PAGE_SIZE;
-		pages4k++;
-	}
-	flush_tlb_kernel_range(start, end);
-	update_page_count(PG_DIRECT_MAP_4K, -pages4k);
-	update_page_count(PG_DIRECT_MAP_1M, -pages1m);
-	update_page_count(PG_DIRECT_MAP_2G, -pages2g);
+	remove_pagetable(start, start + size);
 }
 
 /*

From 9ec8fa8dc331be6b63726be696b2b21d0031a09b Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 22 Jul 2020 11:45:52 +0200
Subject: [PATCH 451/502] s390/vmemmap: extend modify_pagetable() to handle
 vmemmap

Extend our shiny new modify_pagetable() to handle !direct (vmemmap)
mappings. Convert vmemmap_populate() and implement vmemmap_free().

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200722094558.9828-4-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/vmem.c | 181 +++++++++++++++++++-------------------------
 1 file changed, 76 insertions(+), 105 deletions(-)

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 177daf389d39..43fe1e2eb90e 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -29,6 +29,15 @@ static void __ref *vmem_alloc_pages(unsigned int order)
 	return (void *) memblock_phys_alloc(size, size);
 }
 
+static void vmem_free_pages(unsigned long addr, int order)
+{
+	/* We don't expect boot memory to be removed ever. */
+	if (!slab_is_available() ||
+	    WARN_ON_ONCE(PageReserved(phys_to_page(addr))))
+		return;
+	free_pages(addr, order);
+}
+
 void *vmem_crst_alloc(unsigned long val)
 {
 	unsigned long *table;
@@ -54,10 +63,12 @@ pte_t __ref *vmem_pte_alloc(void)
 	return pte;
 }
 
-static void modify_pte_table(pmd_t *pmd, unsigned long addr, unsigned long end,
-			    bool add)
+/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
+static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
+				  unsigned long end, bool add, bool direct)
 {
 	unsigned long prot, pages = 0;
+	int ret = -ENOMEM;
 	pte_t *pte;
 
 	prot = pgprot_val(PAGE_KERNEL);
@@ -69,20 +80,34 @@ static void modify_pte_table(pmd_t *pmd, unsigned long addr, unsigned long end,
 		if (!add) {
 			if (pte_none(*pte))
 				continue;
+			if (!direct)
+				vmem_free_pages(pfn_to_phys(pte_pfn(*pte)), 0);
 			pte_clear(&init_mm, addr, pte);
 		} else if (pte_none(*pte)) {
-			pte_val(*pte) = addr | prot;
+			if (!direct) {
+				void *new_page = vmemmap_alloc_block(PAGE_SIZE,
+								     NUMA_NO_NODE);
+
+				if (!new_page)
+					goto out;
+				pte_val(*pte) = __pa(new_page) | prot;
+			} else
+				pte_val(*pte) = addr | prot;
 		} else
 			continue;
 
 		pages++;
 	}
-
-	update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
+	ret = 0;
+out:
+	if (direct)
+		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
+	return ret;
 }
 
-static int modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end,
-			    bool add)
+/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
+static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
+				  unsigned long end, bool add, bool direct)
 {
 	unsigned long next, prot, pages = 0;
 	int ret = -ENOMEM;
@@ -103,6 +128,9 @@ static int modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end,
 			if (pmd_large(*pmd) && !add) {
 				if (IS_ALIGNED(addr, PMD_SIZE) &&
 				    IS_ALIGNED(next, PMD_SIZE)) {
+					if (!direct)
+						vmem_free_pages(pmd_deref(*pmd),
+								get_order(PMD_SIZE));
 					pmd_clear(pmd);
 					pages++;
 				}
@@ -111,11 +139,27 @@ static int modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end,
 		} else if (pmd_none(*pmd)) {
 			if (IS_ALIGNED(addr, PMD_SIZE) &&
 			    IS_ALIGNED(next, PMD_SIZE) &&
-			    MACHINE_HAS_EDAT1 && addr &&
+			    MACHINE_HAS_EDAT1 && addr && direct &&
 			    !debug_pagealloc_enabled()) {
 				pmd_val(*pmd) = addr | prot;
 				pages++;
 				continue;
+			} else if (!direct && MACHINE_HAS_EDAT1) {
+				void *new_page;
+
+				/*
+				 * Use 1MB frames for vmemmap if available. We
+				 * always use large frames even if they are only
+				 * partially used. Otherwise we would have also
+				 * page tables since vmemmap_populate gets
+				 * called for each section separately.
+				 */
+				new_page = vmemmap_alloc_block(PMD_SIZE,
+							       NUMA_NO_NODE);
+				if (!new_page)
+					goto out;
+				pmd_val(*pmd) = __pa(new_page) | prot;
+				continue;
 			}
 			pte = vmem_pte_alloc();
 			if (!pte)
@@ -124,16 +168,19 @@ static int modify_pmd_table(pud_t *pud, unsigned long addr, unsigned long end,
 		} else if (pmd_large(*pmd))
 			continue;
 
-		modify_pte_table(pmd, addr, next, add);
+		ret = modify_pte_table(pmd, addr, next, add, direct);
+		if (ret)
+			goto out;
 	}
 	ret = 0;
 out:
-	update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
+	if (direct)
+		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
 	return ret;
 }
 
 static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
-			    bool add)
+			    bool add, bool direct)
 {
 	unsigned long next, prot, pages = 0;
 	int ret = -ENOMEM;
@@ -162,7 +209,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
 		} else if (pud_none(*pud)) {
 			if (IS_ALIGNED(addr, PUD_SIZE) &&
 			    IS_ALIGNED(next, PUD_SIZE) &&
-			    MACHINE_HAS_EDAT2 && addr &&
+			    MACHINE_HAS_EDAT2 && addr && direct &&
 			    !debug_pagealloc_enabled()) {
 				pud_val(*pud) = addr | prot;
 				pages++;
@@ -175,18 +222,19 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
 		} else if (pud_large(*pud))
 			continue;
 
-		ret = modify_pmd_table(pud, addr, next, add);
+		ret = modify_pmd_table(pud, addr, next, add, direct);
 		if (ret)
 			goto out;
 	}
 	ret = 0;
 out:
-	update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
+	if (direct)
+		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
 	return ret;
 }
 
 static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
-			    bool add)
+			    bool add, bool direct)
 {
 	unsigned long next;
 	int ret = -ENOMEM;
@@ -206,7 +254,7 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
 				goto out;
 		}
 
-		ret = modify_pud_table(p4d, addr, next, add);
+		ret = modify_pud_table(p4d, addr, next, add, direct);
 		if (ret)
 			goto out;
 	}
@@ -215,7 +263,8 @@ out:
 	return ret;
 }
 
-static int modify_pagetable(unsigned long start, unsigned long end, bool add)
+static int modify_pagetable(unsigned long start, unsigned long end, bool add,
+			    bool direct)
 {
 	unsigned long addr, next;
 	int ret = -ENOMEM;
@@ -239,7 +288,7 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add)
 			pgd_populate(&init_mm, pgd, p4d);
 		}
 
-		ret = modify_p4d_table(pgd, addr, next, add);
+		ret = modify_p4d_table(pgd, addr, next, add, direct);
 		if (ret)
 			goto out;
 	}
@@ -250,14 +299,14 @@ out:
 	return ret;
 }
 
-static int add_pagetable(unsigned long start, unsigned long end)
+static int add_pagetable(unsigned long start, unsigned long end, bool direct)
 {
-	return modify_pagetable(start, end, true);
+	return modify_pagetable(start, end, true, direct);
 }
 
-static int remove_pagetable(unsigned long start, unsigned long end)
+static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
 {
-	return modify_pagetable(start, end, false);
+	return modify_pagetable(start, end, false, direct);
 }
 
 /*
@@ -265,7 +314,7 @@ static int remove_pagetable(unsigned long start, unsigned long end)
  */
 static int vmem_add_range(unsigned long start, unsigned long size)
 {
-	return add_pagetable(start, start + size);
+	return add_pagetable(start, start + size, true);
 }
 
 /*
@@ -274,7 +323,7 @@ static int vmem_add_range(unsigned long start, unsigned long size)
  */
 static void vmem_remove_range(unsigned long start, unsigned long size)
 {
-	remove_pagetable(start, start + size);
+	remove_pagetable(start, start + size, true);
 }
 
 /*
@@ -283,92 +332,14 @@ static void vmem_remove_range(unsigned long start, unsigned long size)
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
 {
-	unsigned long pgt_prot, sgt_prot;
-	unsigned long address = start;
-	pgd_t *pg_dir;
-	p4d_t *p4_dir;
-	pud_t *pu_dir;
-	pmd_t *pm_dir;
-	pte_t *pt_dir;
-	int ret = -ENOMEM;
-
-	pgt_prot = pgprot_val(PAGE_KERNEL);
-	sgt_prot = pgprot_val(SEGMENT_KERNEL);
-	if (!MACHINE_HAS_NX) {
-		pgt_prot &= ~_PAGE_NOEXEC;
-		sgt_prot &= ~_SEGMENT_ENTRY_NOEXEC;
-	}
-	for (address = start; address < end;) {
-		pg_dir = pgd_offset_k(address);
-		if (pgd_none(*pg_dir)) {
-			p4_dir = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
-			if (!p4_dir)
-				goto out;
-			pgd_populate(&init_mm, pg_dir, p4_dir);
-		}
-
-		p4_dir = p4d_offset(pg_dir, address);
-		if (p4d_none(*p4_dir)) {
-			pu_dir = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
-			if (!pu_dir)
-				goto out;
-			p4d_populate(&init_mm, p4_dir, pu_dir);
-		}
-
-		pu_dir = pud_offset(p4_dir, address);
-		if (pud_none(*pu_dir)) {
-			pm_dir = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
-			if (!pm_dir)
-				goto out;
-			pud_populate(&init_mm, pu_dir, pm_dir);
-		}
-
-		pm_dir = pmd_offset(pu_dir, address);
-		if (pmd_none(*pm_dir)) {
-			/* Use 1MB frames for vmemmap if available. We always
-			 * use large frames even if they are only partially
-			 * used.
-			 * Otherwise we would have also page tables since
-			 * vmemmap_populate gets called for each section
-			 * separately. */
-			if (MACHINE_HAS_EDAT1) {
-				void *new_page;
-
-				new_page = vmemmap_alloc_block(PMD_SIZE, node);
-				if (!new_page)
-					goto out;
-				pmd_val(*pm_dir) = __pa(new_page) | sgt_prot;
-				address = (address + PMD_SIZE) & PMD_MASK;
-				continue;
-			}
-			pt_dir = vmem_pte_alloc();
-			if (!pt_dir)
-				goto out;
-			pmd_populate(&init_mm, pm_dir, pt_dir);
-		} else if (pmd_large(*pm_dir)) {
-			address = (address + PMD_SIZE) & PMD_MASK;
-			continue;
-		}
-
-		pt_dir = pte_offset_kernel(pm_dir, address);
-		if (pte_none(*pt_dir)) {
-			void *new_page;
-
-			new_page = vmemmap_alloc_block(PAGE_SIZE, node);
-			if (!new_page)
-				goto out;
-			pte_val(*pt_dir) = __pa(new_page) | pgt_prot;
-		}
-		address += PAGE_SIZE;
-	}
-	ret = 0;
-out:
-	return ret;
+	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
+	return add_pagetable(start, end, false);
 }
 
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap)
 {
+	remove_pagetable(start, end, false);
 }
 
 void vmem_remove_mapping(unsigned long start, unsigned long size)

From c00f05a92424c7788fdbf0909b823f8027596d66 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 22 Jul 2020 11:45:53 +0200
Subject: [PATCH 452/502] s390/vmemmap: cleanup when vmemmap_populate() fails

Clean up what we partially added in case vmemmap_populate() fails. For
vmem, this is already handled by vmem_add_mapping().

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200722094558.9828-5-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/vmem.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 43fe1e2eb90e..be32a38bb91f 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -332,8 +332,13 @@ static void vmem_remove_range(unsigned long start, unsigned long size)
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 		struct vmem_altmap *altmap)
 {
+	int ret;
+
 	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
-	return add_pagetable(start, end, false);
+	ret = add_pagetable(start, end, false);
+	if (ret)
+		remove_pagetable(start, end, false);
+	return ret;
 }
 
 void vmemmap_free(unsigned long start, unsigned long end,

From aa18e0e65800bf3250b23914a28e0e3fd9cadec2 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 22 Jul 2020 11:45:54 +0200
Subject: [PATCH 453/502] s390/vmemmap: take the vmem_mutex when
 populating/freeing

Let's synchronize all accesses to the 1:1 and vmemmap mappings. This will
be especially relevant when cleaning up empty page tables that could be
shared by both. Avoid races when removing tables that might be just
about to get reused.

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200722094558.9828-6-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/vmem.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index be32a38bb91f..a2b79681df69 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -334,17 +334,21 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 {
 	int ret;
 
+	mutex_lock(&vmem_mutex);
 	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
 	ret = add_pagetable(start, end, false);
 	if (ret)
 		remove_pagetable(start, end, false);
+	mutex_unlock(&vmem_mutex);
 	return ret;
 }
 
 void vmemmap_free(unsigned long start, unsigned long end,
 		struct vmem_altmap *altmap)
 {
+	mutex_lock(&vmem_mutex);
 	remove_pagetable(start, end, false);
+	mutex_unlock(&vmem_mutex);
 }
 
 void vmem_remove_mapping(unsigned long start, unsigned long size)

From b9ff81003cf1a0b12b8d60b6ef33a34e84dfe7ac Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 22 Jul 2020 11:45:55 +0200
Subject: [PATCH 454/502] s390/vmem: cleanup empty page tables

Let's cleanup empty page tables. Consider only page tables that fully
fall into the identity mapping and the vmemmap range.

As there are no valid accesses to vmem/vmemmap within non-populated ranges,
the single tlb flush at the end should be sufficient.

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200722094558.9828-7-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/vmem.c | 102 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 101 insertions(+), 1 deletion(-)

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index a2b79681df69..b831f9f9130a 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -63,6 +63,15 @@ pte_t __ref *vmem_pte_alloc(void)
 	return pte;
 }
 
+static void vmem_pte_free(unsigned long *table)
+{
+	/* We don't expect boot memory to be removed ever. */
+	if (!slab_is_available() ||
+	    WARN_ON_ONCE(PageReserved(virt_to_page(table))))
+		return;
+	page_table_free(&init_mm, table);
+}
+
 /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
 static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
 				  unsigned long end, bool add, bool direct)
@@ -105,6 +114,21 @@ out:
 	return ret;
 }
 
+static void try_free_pte_table(pmd_t *pmd, unsigned long start)
+{
+	pte_t *pte;
+	int i;
+
+	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
+	pte = pte_offset_kernel(pmd, start);
+	for (i = 0; i < PTRS_PER_PTE; i++, pte++)
+		if (!pte_none(*pte))
+			return;
+
+	vmem_pte_free(__va(pmd_deref(*pmd)));
+	pmd_clear(pmd);
+}
+
 /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
 static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 				  unsigned long end, bool add, bool direct)
@@ -171,6 +195,8 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 		ret = modify_pte_table(pmd, addr, next, add, direct);
 		if (ret)
 			goto out;
+		if (!add)
+			try_free_pte_table(pmd, addr & PMD_MASK);
 	}
 	ret = 0;
 out:
@@ -179,6 +205,29 @@ out:
 	return ret;
 }
 
+static void try_free_pmd_table(pud_t *pud, unsigned long start)
+{
+	const unsigned long end = start + PUD_SIZE;
+	pmd_t *pmd;
+	int i;
+
+	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+	if (end > VMALLOC_START)
+		return;
+#ifdef CONFIG_KASAN
+	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+		return;
+#endif
+
+	pmd = pmd_offset(pud, start);
+	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
+		if (!pmd_none(*pmd))
+			return;
+
+	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
+	pud_clear(pud);
+}
+
 static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
 			    bool add, bool direct)
 {
@@ -225,6 +274,8 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
 		ret = modify_pmd_table(pud, addr, next, add, direct);
 		if (ret)
 			goto out;
+		if (!add)
+			try_free_pmd_table(pud, addr & PUD_MASK);
 	}
 	ret = 0;
 out:
@@ -233,6 +284,29 @@ out:
 	return ret;
 }
 
+static void try_free_pud_table(p4d_t *p4d, unsigned long start)
+{
+	const unsigned long end = start + P4D_SIZE;
+	pud_t *pud;
+	int i;
+
+	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+	if (end > VMALLOC_START)
+		return;
+#ifdef CONFIG_KASAN
+	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+		return;
+#endif
+
+	pud = pud_offset(p4d, start);
+	for (i = 0; i < PTRS_PER_PUD; i++, pud++)
+		if (!pud_none(*pud))
+			return;
+
+	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
+	p4d_clear(p4d);
+}
+
 static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
 			    bool add, bool direct)
 {
@@ -257,12 +331,37 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
 		ret = modify_pud_table(p4d, addr, next, add, direct);
 		if (ret)
 			goto out;
+		if (!add)
+			try_free_pud_table(p4d, addr & P4D_MASK);
 	}
 	ret = 0;
 out:
 	return ret;
 }
 
+static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
+{
+	const unsigned long end = start + PGDIR_SIZE;
+	p4d_t *p4d;
+	int i;
+
+	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
+	if (end > VMALLOC_START)
+		return;
+#ifdef CONFIG_KASAN
+	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
+		return;
+#endif
+
+	p4d = p4d_offset(pgd, start);
+	for (i = 0; i < PTRS_PER_P4D; i++, p4d++)
+		if (!p4d_none(*p4d))
+			return;
+
+	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
+	pgd_clear(pgd);
+}
+
 static int modify_pagetable(unsigned long start, unsigned long end, bool add,
 			    bool direct)
 {
@@ -291,6 +390,8 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
 		ret = modify_p4d_table(pgd, addr, next, add, direct);
 		if (ret)
 			goto out;
+		if (!add)
+			try_free_p4d_table(pgd, addr & PGDIR_MASK);
 	}
 	ret = 0;
 out:
@@ -319,7 +420,6 @@ static int vmem_add_range(unsigned long start, unsigned long size)
 
 /*
  * Remove a physical memory range from the 1:1 mapping.
- * Currently only invalidates page table entries.
  */
 static void vmem_remove_range(unsigned long start, unsigned long size)
 {

From f2057b4266a6be469ea0630971cf3cd933e42cce Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 22 Jul 2020 11:45:56 +0200
Subject: [PATCH 455/502] s390/vmemmap: fallback to PTEs if mapping large PMD
 fails

Let's fallback to single pages if short on huge pages. No need to stop
memory hotplug.

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200722094558.9828-8-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/vmem.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index b831f9f9130a..e82a63de19db 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -180,10 +180,10 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 				 */
 				new_page = vmemmap_alloc_block(PMD_SIZE,
 							       NUMA_NO_NODE);
-				if (!new_page)
-					goto out;
-				pmd_val(*pmd) = __pa(new_page) | prot;
-				continue;
+				if (new_page) {
+					pmd_val(*pmd) = __pa(new_page) | prot;
+					continue;
+				}
 			}
 			pte = vmem_pte_alloc();
 			if (!pte)

From cd5781d63eaf6dbf89532d8c7c214786b767ee16 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 22 Jul 2020 11:45:57 +0200
Subject: [PATCH 456/502] s390/vmemmap: remember unused sub-pmd ranges

With a memmap size of 56 bytes or 72 bytes per page, the memmap for a
256 MB section won't span full PMDs. As we populate single sections and
depopulate single sections, the depopulation step would not be able to
free all vmemmap pmds anymore.

Do it similarly to x86, marking the unused memmap ranges in a special way
(pad it with 0xFD).

This allows us to add/remove sections, cleaning up all allocated
vmemmap pages even if the memmap size is not a multiple of 16 bytes per page.

A 56 byte memmap can, for example, be created with !CONFIG_MEMCG and
!CONFIG_SLUB.

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200722094558.9828-9-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/vmem.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index e82a63de19db..df361bbacda1 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -72,6 +72,42 @@ static void vmem_pte_free(unsigned long *table)
 	page_table_free(&init_mm, table);
 }
 
+#define PAGE_UNUSED 0xFD
+
+static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+	/*
+	 * As we expect to add in the same granularity as we remove, it's
+	 * sufficient to mark only some piece used to block the memmap page from
+	 * getting removed (just in case the memmap never gets initialized,
+	 * e.g., because the memory block never gets onlined).
+	 */
+	memset(__va(start), 0, sizeof(struct page));
+}
+
+static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
+{
+	void *page = __va(ALIGN_DOWN(start, PMD_SIZE));
+
+	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
+	vmemmap_use_sub_pmd(start, end);
+
+	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
+	if (!IS_ALIGNED(start, PMD_SIZE))
+		memset(page, PAGE_UNUSED, start - __pa(page));
+	if (!IS_ALIGNED(end, PMD_SIZE))
+		memset(__va(end), PAGE_UNUSED, __pa(page) + PMD_SIZE - end);
+}
+
+/* Returns true if the PMD is completely unused and can be freed. */
+static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
+{
+	void *page = __va(ALIGN_DOWN(start, PMD_SIZE));
+
+	memset(__va(start), PAGE_UNUSED, end - start);
+	return !memchr_inv(page, PAGE_UNUSED, PMD_SIZE);
+}
+
 /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
 static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
 				  unsigned long end, bool add, bool direct)
@@ -157,6 +193,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 								get_order(PMD_SIZE));
 					pmd_clear(pmd);
 					pages++;
+				} else if (!direct &&
+					   vmemmap_unuse_sub_pmd(addr, next)) {
+					vmem_free_pages(pmd_deref(*pmd),
+							get_order(PMD_SIZE));
+					pmd_clear(pmd);
 				}
 				continue;
 			}
@@ -182,6 +223,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 							       NUMA_NO_NODE);
 				if (new_page) {
 					pmd_val(*pmd) = __pa(new_page) | prot;
+					if (!IS_ALIGNED(addr, PMD_SIZE) ||
+					    !IS_ALIGNED(next, PMD_SIZE)) {
+						vmemmap_use_new_sub_pmd(addr,
+									next);
+					}
 					continue;
 				}
 			}
@@ -189,8 +235,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 			if (!pte)
 				goto out;
 			pmd_populate(&init_mm, pmd, pte);
-		} else if (pmd_large(*pmd))
+		} else if (pmd_large(*pmd)) {
+			if (!direct)
+				vmemmap_use_sub_pmd(addr, next);
 			continue;
+		}
 
 		ret = modify_pte_table(pmd, addr, next, add, direct);
 		if (ret)

From 2c114df071935762ffa88144cdab03d84beaa702 Mon Sep 17 00:00:00 2001
From: David Hildenbrand <david@redhat.com>
Date: Wed, 22 Jul 2020 11:45:58 +0200
Subject: [PATCH 457/502] s390/vmemmap: avoid memset(PAGE_UNUSED) when adding
 consecutive sections

Let's avoid memset(PAGE_UNUSED) when adding consecutive sections,
whereby the vmemmap of a single section does not span full PMDs.

Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20200722094558.9828-10-david@redhat.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/vmem.c | 45 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 42 insertions(+), 3 deletions(-)

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index df361bbacda1..70ebfc7958a6 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -74,7 +74,22 @@ static void vmem_pte_free(unsigned long *table)
 
 #define PAGE_UNUSED 0xFD
 
-static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+/*
+ * The unused vmemmap range, which was not yet memset(PAGE_UNUSED) ranges
+ * from unused_pmd_start to next PMD_SIZE boundary.
+ */
+static unsigned long unused_pmd_start;
+
+static void vmemmap_flush_unused_pmd(void)
+{
+	if (!unused_pmd_start)
+		return;
+	memset(__va(unused_pmd_start), PAGE_UNUSED,
+	       ALIGN(unused_pmd_start, PMD_SIZE) - unused_pmd_start);
+	unused_pmd_start = 0;
+}
+
+static void __vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
 {
 	/*
 	 * As we expect to add in the same granularity as we remove, it's
@@ -85,18 +100,41 @@ static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
 	memset(__va(start), 0, sizeof(struct page));
 }
 
+static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
+{
+	/*
+	 * We only optimize if the new used range directly follows the
+	 * previously unused range (esp., when populating consecutive sections).
+	 */
+	if (unused_pmd_start == start) {
+		unused_pmd_start = end;
+		if (likely(IS_ALIGNED(unused_pmd_start, PMD_SIZE)))
+			unused_pmd_start = 0;
+		return;
+	}
+	vmemmap_flush_unused_pmd();
+	__vmemmap_use_sub_pmd(start, end);
+}
+
 static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
 {
 	void *page = __va(ALIGN_DOWN(start, PMD_SIZE));
 
+	vmemmap_flush_unused_pmd();
+
 	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
-	vmemmap_use_sub_pmd(start, end);
+	__vmemmap_use_sub_pmd(start, end);
 
 	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
 	if (!IS_ALIGNED(start, PMD_SIZE))
 		memset(page, PAGE_UNUSED, start - __pa(page));
+	/*
+	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
+	 * consecutive sections. Remember for the last added PMD the last
+	 * unused range in the populated PMD.
+	 */
 	if (!IS_ALIGNED(end, PMD_SIZE))
-		memset(__va(end), PAGE_UNUSED, __pa(page) + PMD_SIZE - end);
+		unused_pmd_start = end;
 }
 
 /* Returns true if the PMD is completely unused and can be freed. */
@@ -104,6 +142,7 @@ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
 {
 	void *page = __va(ALIGN_DOWN(start, PMD_SIZE));
 
+	vmemmap_flush_unused_pmd();
 	memset(__va(start), PAGE_UNUSED, end - start);
 	return !memchr_inv(page, PAGE_UNUSED, PMD_SIZE);
 }

From 9a996c67a65d937b23408e56935ef23404c9418e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <hca@linux.ibm.com>
Date: Thu, 23 Jul 2020 21:42:36 +0200
Subject: [PATCH 458/502] s390/vmemmap: coding style updates

Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---
 arch/s390/mm/vmem.c | 55 +++++++++++++++++----------------------------
 1 file changed, 20 insertions(+), 35 deletions(-)

diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index 70ebfc7958a6..1aed1a4dfc2d 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -169,17 +169,17 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
 			pte_clear(&init_mm, addr, pte);
 		} else if (pte_none(*pte)) {
 			if (!direct) {
-				void *new_page = vmemmap_alloc_block(PAGE_SIZE,
-								     NUMA_NO_NODE);
+				void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);
 
 				if (!new_page)
 					goto out;
 				pte_val(*pte) = __pa(new_page) | prot;
-			} else
+			} else {
 				pte_val(*pte) = addr | prot;
-		} else
+			}
+		} else {
 			continue;
-
+		}
 		pages++;
 	}
 	ret = 0;
@@ -196,10 +196,10 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start)
 
 	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
 	pte = pte_offset_kernel(pmd, start);
-	for (i = 0; i < PTRS_PER_PTE; i++, pte++)
+	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
 		if (!pte_none(*pte))
 			return;
-
+	}
 	vmem_pte_free(__va(pmd_deref(*pmd)));
 	pmd_clear(pmd);
 }
@@ -220,7 +220,6 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 	pmd = pmd_offset(pud, addr);
 	for (; addr < end; addr = next, pmd++) {
 		next = pmd_addr_end(addr, end);
-
 		if (!add) {
 			if (pmd_none(*pmd))
 				continue;
@@ -228,14 +227,11 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 				if (IS_ALIGNED(addr, PMD_SIZE) &&
 				    IS_ALIGNED(next, PMD_SIZE)) {
 					if (!direct)
-						vmem_free_pages(pmd_deref(*pmd),
-								get_order(PMD_SIZE));
+						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
 					pmd_clear(pmd);
 					pages++;
-				} else if (!direct &&
-					   vmemmap_unuse_sub_pmd(addr, next)) {
-					vmem_free_pages(pmd_deref(*pmd),
-							get_order(PMD_SIZE));
+				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
+					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
 					pmd_clear(pmd);
 				}
 				continue;
@@ -258,14 +254,12 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 				 * page tables since vmemmap_populate gets
 				 * called for each section separately.
 				 */
-				new_page = vmemmap_alloc_block(PMD_SIZE,
-							       NUMA_NO_NODE);
+				new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
 				if (new_page) {
 					pmd_val(*pmd) = __pa(new_page) | prot;
 					if (!IS_ALIGNED(addr, PMD_SIZE) ||
 					    !IS_ALIGNED(next, PMD_SIZE)) {
-						vmemmap_use_new_sub_pmd(addr,
-									next);
+						vmemmap_use_new_sub_pmd(addr, next);
 					}
 					continue;
 				}
@@ -279,7 +273,6 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
 				vmemmap_use_sub_pmd(addr, next);
 			continue;
 		}
-
 		ret = modify_pte_table(pmd, addr, next, add, direct);
 		if (ret)
 			goto out;
@@ -306,12 +299,10 @@ static void try_free_pmd_table(pud_t *pud, unsigned long start)
 	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
 		return;
 #endif
-
 	pmd = pmd_offset(pud, start);
 	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
 		if (!pmd_none(*pmd))
 			return;
-
 	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
 	pud_clear(pud);
 }
@@ -327,11 +318,9 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
 	prot = pgprot_val(REGION3_KERNEL);
 	if (!MACHINE_HAS_NX)
 		prot &= ~_REGION_ENTRY_NOEXEC;
-
 	pud = pud_offset(p4d, addr);
 	for (; addr < end; addr = next, pud++) {
 		next = pud_addr_end(addr, end);
-
 		if (!add) {
 			if (pud_none(*pud))
 				continue;
@@ -356,9 +345,9 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
 			if (!pmd)
 				goto out;
 			pud_populate(&init_mm, pud, pmd);
-		} else if (pud_large(*pud))
+		} else if (pud_large(*pud)) {
 			continue;
-
+		}
 		ret = modify_pmd_table(pud, addr, next, add, direct);
 		if (ret)
 			goto out;
@@ -387,10 +376,10 @@ static void try_free_pud_table(p4d_t *p4d, unsigned long start)
 #endif
 
 	pud = pud_offset(p4d, start);
-	for (i = 0; i < PTRS_PER_PUD; i++, pud++)
+	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
 		if (!pud_none(*pud))
 			return;
-
+	}
 	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
 	p4d_clear(p4d);
 }
@@ -406,7 +395,6 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
 	p4d = p4d_offset(pgd, addr);
 	for (; addr < end; addr = next, p4d++) {
 		next = p4d_addr_end(addr, end);
-
 		if (!add) {
 			if (p4d_none(*p4d))
 				continue;
@@ -415,7 +403,6 @@ static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
 			if (!pud)
 				goto out;
 		}
-
 		ret = modify_pud_table(p4d, addr, next, add, direct);
 		if (ret)
 			goto out;
@@ -442,10 +429,10 @@ static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
 #endif
 
 	p4d = p4d_offset(pgd, start);
-	for (i = 0; i < PTRS_PER_P4D; i++, p4d++)
+	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
 		if (!p4d_none(*p4d))
 			return;
-
+	}
 	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
 	pgd_clear(pgd);
 }
@@ -460,7 +447,6 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
 
 	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
 		return -EINVAL;
-
 	for (addr = start; addr < end; addr = next) {
 		next = pgd_addr_end(addr, end);
 		pgd = pgd_offset_k(addr);
@@ -474,7 +460,6 @@ static int modify_pagetable(unsigned long start, unsigned long end, bool add,
 				goto out;
 			pgd_populate(&init_mm, pgd, p4d);
 		}
-
 		ret = modify_p4d_table(pgd, addr, next, add, direct);
 		if (ret)
 			goto out;
@@ -518,7 +503,7 @@ static void vmem_remove_range(unsigned long start, unsigned long size)
  * Add a backed mem_map array to the virtual mem_map array.
  */
 int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
-		struct vmem_altmap *altmap)
+			       struct vmem_altmap *altmap)
 {
 	int ret;
 
@@ -532,7 +517,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
 }
 
 void vmemmap_free(unsigned long start, unsigned long end,
-		struct vmem_altmap *altmap)
+		  struct vmem_altmap *altmap)
 {
 	mutex_lock(&vmem_mutex);
 	remove_pagetable(start, end, false);

From ed00495333ccc80fc8fb86fb43773c3c2a499466 Mon Sep 17 00:00:00 2001
From: "peterz@infradead.org" <peterz@infradead.org>
Date: Mon, 27 Jul 2020 14:48:52 +0200
Subject: [PATCH 459/502] locking/lockdep: Fix TRACE_IRQFLAGS vs. NMIs

Prior to commit:

  859d069ee1dd ("lockdep: Prepare for NMI IRQ state tracking")

IRQ state tracking was disabled in NMIs due to nmi_enter()
doing lockdep_off() -- with the obvious requirement that NMI entry
call nmi_enter() before trace_hardirqs_off().

[ AFAICT, PowerPC and SH violate this order on their NMI entry ]

However, that commit explicitly changed lockdep_hardirqs_*() to ignore
lockdep_off() and breaks every architecture that has irq-tracing in
its NMI entry that hasn't been fixed up (x86 being the only fixed one
at this point).

The reason for this change is that by ignoring lockdep_off() we can:

  - get rid of 'current->lockdep_recursion' in lockdep_assert_irqs*()
    which was going to give header-recursion issues with the
    seqlock rework.

  - allow these lockdep_assert_*() macros to function in NMI context.

Restore the previous state of things and allow an architecture to
opt-in to the NMI IRQ tracking support, however instead of relying on
lockdep_off(), rely on in_nmi(), both are part of nmi_enter() and so
over-all entry ordering doesn't need to change.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200727124852.GK119549@hirez.programming.kicks-ass.net
---
 arch/x86/Kconfig.debug   | 3 +++
 kernel/locking/lockdep.c | 8 +++++++-
 lib/Kconfig.debug        | 6 ++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 0dd319e6e5b4..ee1d3c5834c6 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -3,6 +3,9 @@
 config TRACE_IRQFLAGS_SUPPORT
 	def_bool y
 
+config TRACE_IRQFLAGS_NMI_SUPPORT
+	def_bool y
+
 config EARLY_PRINTK_USB
 	bool
 
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d595623c4b34..8b0b28b4546b 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3712,6 +3712,9 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 	 * and not rely on hardware state like normal interrupts.
 	 */
 	if (unlikely(in_nmi())) {
+		if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+			return;
+
 		/*
 		 * Skip:
 		 *  - recursion check, because NMI can hit lockdep;
@@ -3773,7 +3776,10 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 	 * they will restore the software state. This ensures the software
 	 * state is consistent inside NMIs as well.
 	 */
-	if (unlikely(!in_nmi() && (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)))
+	if (in_nmi()) {
+		if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
+			return;
+	} else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)
 		return;
 
 	/*
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 9ad9210d70a1..fa964b51f066 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1325,11 +1325,17 @@ config WW_MUTEX_SELFTEST
 endmenu # lock debugging
 
 config TRACE_IRQFLAGS
+	depends on TRACE_IRQFLAGS_SUPPORT
 	bool
 	help
 	  Enables hooks to interrupt enabling and disabling for
 	  either tracing or lock debugging.
 
+config TRACE_IRQFLAGS_NMI
+	def_bool y
+	depends on TRACE_IRQFLAGS
+	depends on TRACE_IRQFLAGS_NMI_SUPPORT
+
 config STACKTRACE
 	bool "Stack backtrace support"
 	depends on STACKTRACE_SUPPORT

From f0c7baca180046824e07fc5f1326e83a8fd150c7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 24 Jul 2020 22:44:41 +0200
Subject: [PATCH 460/502] genirq/affinity: Make affinity setting if activated
 opt-in

John reported that on a RK3288 system the perf per CPU interrupts are all
affine to CPU0 and provided the analysis:

 "It looks like what happens is that because the interrupts are not per-CPU
  in the hardware, armpmu_request_irq() calls irq_force_affinity() while
  the interrupt is deactivated and then request_irq() with IRQF_PERCPU |
  IRQF_NOBALANCING.

  Now when irq_startup() runs with IRQ_STARTUP_NORMAL, it calls
  irq_setup_affinity() which returns early because IRQF_PERCPU and
  IRQF_NOBALANCING are set, leaving the interrupt on its original CPU."

This was broken by the recent commit which blocked interrupt affinity
setting in hardware before activation of the interrupt. While this works in
general, it does not work for this particular case. As contrary to the
initial analysis not all interrupt chip drivers implement an activate
callback, the safe cure is to make the deferred interrupt affinity setting
at activation time opt-in.

Implement the necessary core logic and make the two irqchip implementations
for which this is required opt-in. In hindsight this would have been the
right thing to do, but ...

Fixes: baedb87d1b53 ("genirq/affinity: Handle affinity setting on inactive interrupts correctly")
Reported-by: John Keeping <john@metanate.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Tested-by: Marc Zyngier <maz@kernel.org>
Acked-by: Marc Zyngier <maz@kernel.org>
Cc: stable@vger.kernel.org
Link: https://lkml.kernel.org/r/87blk4tzgm.fsf@nanos.tec.linutronix.de
---
 arch/x86/kernel/apic/vector.c    |  4 ++++
 drivers/irqchip/irq-gic-v3-its.c |  5 ++++-
 include/linux/irq.h              | 13 +++++++++++++
 kernel/irq/manage.c              |  6 +++++-
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 7649da2478d8..dae32d948bf2 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -560,6 +560,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 		 * as that can corrupt the affinity move state.
 		 */
 		irqd_set_handle_enforce_irqctx(irqd);
+
+		/* Don't invoke affinity setter on deactivated interrupts */
+		irqd_set_affinity_on_activate(irqd);
+
 		/*
 		 * Legacy vectors are already assigned when the IOAPIC
 		 * takes them over. They stay on the same vector. This is
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index beac4caefad9..103d850b5595 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -3523,6 +3523,7 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 	msi_alloc_info_t *info = args;
 	struct its_device *its_dev = info->scratchpad[0].ptr;
 	struct its_node *its = its_dev->its;
+	struct irq_data *irqd;
 	irq_hw_number_t hwirq;
 	int err;
 	int i;
@@ -3542,7 +3543,9 @@ static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
 
 		irq_domain_set_hwirq_and_chip(domain, virq + i,
 					      hwirq + i, &its_irq_chip, its_dev);
-		irqd_set_single_target(irq_desc_get_irq_data(irq_to_desc(virq + i)));
+		irqd = irq_get_irq_data(virq + i);
+		irqd_set_single_target(irqd);
+		irqd_set_affinity_on_activate(irqd);
 		pr_debug("ID:%d pID:%d vID:%d\n",
 			 (int)(hwirq + i - its_dev->event_map.lpi_base),
 			 (int)(hwirq + i), virq + i);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8d5bc2c237d7..1b7f4dfee35b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -213,6 +213,8 @@ struct irq_data {
  *				  required
  * IRQD_HANDLE_ENFORCE_IRQCTX	- Enforce that handle_irq_*() is only invoked
  *				  from actual interrupt context.
+ * IRQD_AFFINITY_ON_ACTIVATE	- Affinity is set on activation. Don't call
+ *				  irq_chip::irq_set_affinity() when deactivated.
  */
 enum {
 	IRQD_TRIGGER_MASK		= 0xf,
@@ -237,6 +239,7 @@ enum {
 	IRQD_CAN_RESERVE		= (1 << 26),
 	IRQD_MSI_NOMASK_QUIRK		= (1 << 27),
 	IRQD_HANDLE_ENFORCE_IRQCTX	= (1 << 28),
+	IRQD_AFFINITY_ON_ACTIVATE	= (1 << 29),
 };
 
 #define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
@@ -421,6 +424,16 @@ static inline bool irqd_msi_nomask_quirk(struct irq_data *d)
 	return __irqd_to_state(d) & IRQD_MSI_NOMASK_QUIRK;
 }
 
+static inline void irqd_set_affinity_on_activate(struct irq_data *d)
+{
+	__irqd_to_state(d) |= IRQD_AFFINITY_ON_ACTIVATE;
+}
+
+static inline bool irqd_affinity_on_activate(struct irq_data *d)
+{
+	return __irqd_to_state(d) & IRQD_AFFINITY_ON_ACTIVATE;
+}
+
 #undef __irqd_to_state
 
 static inline irq_hw_number_t irqd_to_hwirq(struct irq_data *d)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2a9fec53e159..48c38e09c673 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -320,12 +320,16 @@ static bool irq_set_affinity_deactivated(struct irq_data *data,
 	struct irq_desc *desc = irq_data_to_desc(data);
 
 	/*
+	 * Handle irq chips which can handle affinity only in activated
+	 * state correctly
+	 *
 	 * If the interrupt is not yet activated, just store the affinity
 	 * mask and do not call the chip driver at all. On activation the
 	 * driver has to make sure anyway that the interrupt is in a
 	 * useable state so startup works.
 	 */
-	if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) || irqd_is_activated(data))
+	if (!IS_ENABLED(CONFIG_IRQ_DOMAIN_HIERARCHY) ||
+	    irqd_is_activated(data) || !irqd_affinity_on_activate(data))
 		return false;
 
 	cpumask_copy(desc->irq_common_data.affinity, mask);

From aa251fc5b936d3ddb4b4c4b36427eb9aa3347c82 Mon Sep 17 00:00:00 2001
From: Marc Zyngier <maz@kernel.org>
Date: Sat, 25 Jul 2020 13:30:55 +0100
Subject: [PATCH 461/502] genirq/debugfs: Add missing irqchip flags

Recently introduced irqchip flags lack the corresponding printouts in
debugfs. Add them.

Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/874kpvydxc.wl-maz@kernel.org
---
 kernel/irq/debugfs.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index 4f9f844074db..b95ff5d5f4bd 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -112,6 +112,7 @@ static const struct irq_bit_descr irqdata_states[] = {
 	BIT_MASK_DESCR(IRQD_AFFINITY_SET),
 	BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
 	BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
+	BIT_MASK_DESCR(IRQD_AFFINITY_ON_ACTIVATE),
 	BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN),
 	BIT_MASK_DESCR(IRQD_CAN_RESERVE),
 	BIT_MASK_DESCR(IRQD_MSI_NOMASK_QUIRK),
@@ -120,6 +121,10 @@ static const struct irq_bit_descr irqdata_states[] = {
 
 	BIT_MASK_DESCR(IRQD_WAKEUP_STATE),
 	BIT_MASK_DESCR(IRQD_WAKEUP_ARMED),
+
+	BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET),
+
+	BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
 };
 
 static const struct irq_bit_descr irqdesc_states[] = {

From e885d5d94793ef342e49d55672baabbc16e32bb1 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 16 Jul 2020 16:36:50 +1000
Subject: [PATCH 462/502] lockdep: Move list.h inclusion into lockdep.h

Currently lockdep_types.h includes list.h without actually using any
of its macros or functions.  All it needs are the type definitions
which were moved into types.h long ago.  This potentially causes
inclusion loops because both are included by many core header
files.

This patch moves the list.h inclusion into lockdep.h.  Note that
we could probably remove it completely but that could potentially
result in compile failures should any end users not include list.h
directly and also be unlucky enough to not get list.h via some other
header file.

Reported-by: Petr Mladek <pmladek@suse.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Petr Mladek <pmladek@suse.com>
Link: https://lkml.kernel.org/r/20200716063649.GA23065@gondor.apana.org.au
---
 include/linux/lockdep.h       | 1 +
 include/linux/lockdep_types.h | 2 --
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 3b73cf84f77d..b1ad5c045353 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -21,6 +21,7 @@ extern int lock_stat;
 #ifdef CONFIG_LOCKDEP
 
 #include <linux/linkage.h>
+#include <linux/list.h>
 #include <linux/debug_locks.h>
 #include <linux/stacktrace.h>
 
diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h
index 7b9350624577..bb35b449f533 100644
--- a/include/linux/lockdep_types.h
+++ b/include/linux/lockdep_types.h
@@ -32,8 +32,6 @@ enum lockdep_wait_type {
 
 #ifdef CONFIG_LOCKDEP
 
-#include <linux/list.h>
-
 /*
  * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
  * the total number of states... :-(

From 112a0e4171e111e963aada3fe790c71accf4d705 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@kernel.org>
Date: Tue, 28 Jul 2020 16:34:00 +0900
Subject: [PATCH 463/502] kprobes: Remove unnecessary module_mutex locking from
 kprobe_optimizer()

Since we already lock both kprobe_mutex and text_mutex in the optimizer,
text will not be changed and the module unloading will be stopped
inside kprobes_module_callback().

The mutex_lock(&module_mutex) was originally introduced to avoid
conflicts with text modification, because at that point we didn't hold
text_mutex.

But after:

  f1c6ece23729 ("kprobes: Fix potential deadlock in kprobe_optimizer()")

We started holding the text_mutex and don't need the modules mutex anyway.

So remove the module_mutex locking.

[ mingo: Amended the changelog. ]

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Masami Hiramatsu <mhiramat@kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Jarkko Sakkinen <jarkko.sakkinen@linux.intel.com>
Link: https://lore.kernel.org/r/20200728163400.e00b09c594763349f99ce6cb@kernel.org
---
 kernel/kprobes.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 146c648eb943..e87679a48ba2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -598,8 +598,6 @@ static void kprobe_optimizer(struct work_struct *work)
 	mutex_lock(&kprobe_mutex);
 	cpus_read_lock();
 	mutex_lock(&text_mutex);
-	/* Lock modules while optimizing kprobes */
-	mutex_lock(&module_mutex);
 
 	/*
 	 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
@@ -624,7 +622,6 @@ static void kprobe_optimizer(struct work_struct *work)
 	/* Step 4: Free cleaned kprobes after quiesence period */
 	do_free_cleaned_kprobes();
 
-	mutex_unlock(&module_mutex);
 	mutex_unlock(&text_mutex);
 	cpus_read_unlock();
 

From d903b6d029d66e6478562d75ea18d89098f7b7e8 Mon Sep 17 00:00:00 2001
From: Pu Wen <puwen@hygon.cn>
Date: Mon, 20 Jul 2020 16:22:05 +0800
Subject: [PATCH 464/502] perf/x86/rapl: Add Hygon Fam18h RAPL support

Hygon Family 18h (Dhyana) indicates RAPL support in bit 14 of CPUID
0x80000007 EDX, and has the MSRs RAPL_PWR_UNIT/CORE_ENERGY_STAT/
PKG_ENERGY_STAT. So add Hygon Dhyana Family 18h support for RAPL.

The output is available via the energy-pkg pseudo event:

  $ perf stat -a -I 1000 --per-socket -e power/energy-pkg/

[ mingo: Tidied up the initializers. ]

Signed-off-by: Pu Wen <puwen@hygon.cn>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200720082205.1307-1-puwen@hygon.cn
---
 arch/x86/events/rapl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 0f2bf59f4354..68b38820b10e 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -787,7 +787,8 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&model_hsx),
 	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&model_skl),
 	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&model_skl),
-	X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h),
+	X86_MATCH_VENDOR_FAM(AMD,	0x17,		&model_amd_fam17h),
+	X86_MATCH_VENDOR_FAM(HYGON,	0x18,		&model_amd_fam17h),
 	{},
 };
 MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);

From 07d2e59f27cd728e6982b52441673886a6d04267 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Fri, 19 Jun 2020 09:20:02 +0100
Subject: [PATCH 465/502] ACPI/IORT: Make iort_match_node_callback walk the
 ACPI namespace for NC

When the iort_match_node_callback is invoked for a named component
the match should be executed upon a device with an ACPI companion.

For devices with no ACPI companion set up, the ACPI device tree must be
walked in order to find the first parent node with a companion set. That
parent node is then checked against the named component entry to
determine whether there is a match, i.e. whether an IORT node describing
the in/out ID translation for the device has been found.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Link: https://lore.kernel.org/r/20200619082013.13661-2-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/acpi/arm64/iort.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 28a6b387e80e..5eee81758184 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -264,15 +264,31 @@ static acpi_status iort_match_node_callback(struct acpi_iort_node *node,
 
 	if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT) {
 		struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
-		struct acpi_device *adev = to_acpi_device_node(dev->fwnode);
+		struct acpi_device *adev;
 		struct acpi_iort_named_component *ncomp;
+		struct device *nc_dev = dev;
+
+		/*
+		 * Walk the device tree to find a device with an
+		 * ACPI companion; there is no point in scanning
+		 * IORT for a device matching a named component if
+		 * the device does not have an ACPI companion to
+		 * start with.
+		 */
+		do {
+			adev = ACPI_COMPANION(nc_dev);
+			if (adev)
+				break;
+
+			nc_dev = nc_dev->parent;
+		} while (nc_dev);
 
 		if (!adev)
 			goto out;
 
 		status = acpi_get_name(adev->handle, ACPI_FULL_PATHNAME, &buf);
 		if (ACPI_FAILURE(status)) {
-			dev_warn(dev, "Can't get device full path name\n");
+			dev_warn(nc_dev, "Can't get device full path name\n");
 			goto out;
 		}
 

From d1718a1b7a86743b9c517bf9521695ba909c734f Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Fri, 19 Jun 2020 09:20:03 +0100
Subject: [PATCH 466/502] ACPI/IORT: Make iort_get_device_domain IRQ domain
 agnostic

iort_get_device_domain() is PCI specific but it need not be,
since it can be used to retrieve IRQ domain nexus of any kind
by adding an irq_domain_bus_token input to it.

Make it PCI agnostic by also renaming the requester ID input
to a more generic ID name.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>   # pci/msi.c
Cc: Will Deacon <will@kernel.org>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Link: https://lore.kernel.org/r/20200619082013.13661-3-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/acpi/arm64/iort.c | 14 +++++++-------
 drivers/pci/msi.c         |  3 ++-
 include/linux/acpi_iort.h |  7 ++++---
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 5eee81758184..902e2aaca946 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -550,7 +550,6 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev)
 		node = iort_get_iort_node(dev->fwnode);
 		if (node)
 			return node;
-
 		/*
 		 * if not, then it should be a platform device defined in
 		 * DSDT/SSDT (with Named Component node in IORT)
@@ -641,13 +640,13 @@ static int __maybe_unused iort_find_its_base(u32 its_id, phys_addr_t *base)
 /**
  * iort_dev_find_its_id() - Find the ITS identifier for a device
  * @dev: The device.
- * @req_id: Device's requester ID
+ * @id: Device's ID
  * @idx: Index of the ITS identifier list.
  * @its_id: ITS identifier.
  *
  * Returns: 0 on success, appropriate error value otherwise
  */
-static int iort_dev_find_its_id(struct device *dev, u32 req_id,
+static int iort_dev_find_its_id(struct device *dev, u32 id,
 				unsigned int idx, int *its_id)
 {
 	struct acpi_iort_its_group *its;
@@ -657,7 +656,7 @@ static int iort_dev_find_its_id(struct device *dev, u32 req_id,
 	if (!node)
 		return -ENXIO;
 
-	node = iort_node_map_id(node, req_id, NULL, IORT_MSI_TYPE);
+	node = iort_node_map_id(node, id, NULL, IORT_MSI_TYPE);
 	if (!node)
 		return -ENXIO;
 
@@ -680,19 +679,20 @@ static int iort_dev_find_its_id(struct device *dev, u32 req_id,
  *
  * Returns: the MSI domain for this device, NULL otherwise
  */
-struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id)
+struct irq_domain *iort_get_device_domain(struct device *dev, u32 id,
+					  enum irq_domain_bus_token bus_token)
 {
 	struct fwnode_handle *handle;
 	int its_id;
 
-	if (iort_dev_find_its_id(dev, req_id, 0, &its_id))
+	if (iort_dev_find_its_id(dev, id, 0, &its_id))
 		return NULL;
 
 	handle = iort_find_domain_token(its_id);
 	if (!handle)
 		return NULL;
 
-	return irq_find_matching_fwnode(handle, DOMAIN_BUS_PCI_MSI);
+	return irq_find_matching_fwnode(handle, bus_token);
 }
 
 static void iort_set_device_domain(struct device *dev,
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 6b43a5455c7a..74a91f52ecc0 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1558,7 +1558,8 @@ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev)
 	pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid);
 	dom = of_msi_map_get_device_domain(&pdev->dev, rid);
 	if (!dom)
-		dom = iort_get_device_domain(&pdev->dev, rid);
+		dom = iort_get_device_domain(&pdev->dev, rid,
+					     DOMAIN_BUS_PCI_MSI);
 	return dom;
 }
 #endif /* CONFIG_PCI_MSI_IRQ_DOMAIN */
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index 8e7e2ec37f1b..08ec6bd2297f 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -29,7 +29,8 @@ struct fwnode_handle *iort_find_domain_token(int trans_id);
 #ifdef CONFIG_ACPI_IORT
 void acpi_iort_init(void);
 u32 iort_msi_map_rid(struct device *dev, u32 req_id);
-struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id);
+struct irq_domain *iort_get_device_domain(struct device *dev, u32 id,
+					  enum irq_domain_bus_token bus_token);
 void acpi_configure_pmsi_domain(struct device *dev);
 int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
@@ -40,8 +41,8 @@ int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
 static inline void acpi_iort_init(void) { }
 static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id)
 { return req_id; }
-static inline struct irq_domain *iort_get_device_domain(struct device *dev,
-							u32 req_id)
+static inline struct irq_domain *iort_get_device_domain(
+	struct device *dev, u32 id, enum irq_domain_bus_token bus_token)
 { return NULL; }
 static inline void acpi_configure_pmsi_domain(struct device *dev) { }
 /* IOMMU interface */

From 39c3cf566ceafa7c1ae331a5f26fbb685d670001 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Fri, 19 Jun 2020 09:20:04 +0100
Subject: [PATCH 467/502] ACPI/IORT: Make iort_msi_map_rid() PCI agnostic

There is nothing PCI specific in iort_msi_map_rid().

Rename the function using a bus protocol agnostic name,
iort_msi_map_id(), and convert current callers to it.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Cc: Will Deacon <will@kernel.org>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Link: https://lore.kernel.org/r/20200619082013.13661-4-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/acpi/arm64/iort.c | 12 ++++++------
 drivers/pci/msi.c         |  2 +-
 include/linux/acpi_iort.h |  6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 902e2aaca946..53f9ef515089 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -568,22 +568,22 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev)
 }
 
 /**
- * iort_msi_map_rid() - Map a MSI requester ID for a device
+ * iort_msi_map_id() - Map a MSI input ID for a device
  * @dev: The device for which the mapping is to be done.
- * @req_id: The device requester ID.
+ * @input_id: The device input ID.
  *
- * Returns: mapped MSI RID on success, input requester ID otherwise
+ * Returns: mapped MSI ID on success, input ID otherwise
  */
-u32 iort_msi_map_rid(struct device *dev, u32 req_id)
+u32 iort_msi_map_id(struct device *dev, u32 input_id)
 {
 	struct acpi_iort_node *node;
 	u32 dev_id;
 
 	node = iort_find_dev_node(dev);
 	if (!node)
-		return req_id;
+		return input_id;
 
-	iort_node_map_id(node, req_id, &dev_id, IORT_MSI_TYPE);
+	iort_node_map_id(node, input_id, &dev_id, IORT_MSI_TYPE);
 	return dev_id;
 }
 
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 74a91f52ecc0..77f48b95e277 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1536,7 +1536,7 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev)
 
 	of_node = irq_domain_get_of_node(domain);
 	rid = of_node ? of_msi_map_rid(&pdev->dev, of_node, rid) :
-			iort_msi_map_rid(&pdev->dev, rid);
+			iort_msi_map_id(&pdev->dev, rid);
 
 	return rid;
 }
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index 08ec6bd2297f..e51425e083da 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -28,7 +28,7 @@ void iort_deregister_domain_token(int trans_id);
 struct fwnode_handle *iort_find_domain_token(int trans_id);
 #ifdef CONFIG_ACPI_IORT
 void acpi_iort_init(void);
-u32 iort_msi_map_rid(struct device *dev, u32 req_id);
+u32 iort_msi_map_id(struct device *dev, u32 id);
 struct irq_domain *iort_get_device_domain(struct device *dev, u32 id,
 					  enum irq_domain_bus_token bus_token);
 void acpi_configure_pmsi_domain(struct device *dev);
@@ -39,8 +39,8 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev);
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
 #else
 static inline void acpi_iort_init(void) { }
-static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id)
-{ return req_id; }
+static inline u32 iort_msi_map_id(struct device *dev, u32 id)
+{ return id; }
 static inline struct irq_domain *iort_get_device_domain(
 	struct device *dev, u32 id, enum irq_domain_bus_token bus_token)
 { return NULL; }

From 3a3d208beede7ae03f8c80bed01f47d6b98d4ceb Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Fri, 19 Jun 2020 09:20:05 +0100
Subject: [PATCH 468/502] ACPI/IORT: Remove useless PCI bus walk

The PCI bus domain number (used in the iort_match_node_callback() -
pci_domain_nr() call) is cascaded through the PCI bus hierarchy at PCI
bus enumeration time, therefore there is no need in iort_find_dev_node()
to walk the PCI bus upwards to grab the root bus to be passed to
iort_scan_node(), the device->bus PCI bus pointer will do.

Remove this useless code.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Link: https://lore.kernel.org/r/20200619082013.13661-5-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/acpi/arm64/iort.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 53f9ef515089..421c6976ab81 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -558,10 +558,7 @@ static struct acpi_iort_node *iort_find_dev_node(struct device *dev)
 				      iort_match_node_callback, dev);
 	}
 
-	/* Find a PCI root bus */
 	pbus = to_pci_dev(dev)->bus;
-	while (!pci_is_root_bus(pbus))
-		pbus = pbus->parent;
 
 	return iort_scan_node(ACPI_IORT_NODE_PCI_ROOT_COMPLEX,
 			      iort_match_node_callback, &pbus->dev);

From b8e069a2a8da02137605ba585837a3a0c45df01a Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Fri, 19 Jun 2020 09:20:06 +0100
Subject: [PATCH 469/502] ACPI/IORT: Add an input ID to acpi_dma_configure()

Some HW devices are created as child devices of proprietary busses
that have a bus specific policy defining how the child devices'
wires representing the device ID are translated into IOMMU and
IRQ controller device IDs.

Current IORT code provides translations for:

- PCI devices, where the device ID is well identified at bus level
  as the requester ID (RID)
- Platform devices that are endpoint devices where the device ID is
  retrieved from the ACPI object IORT mappings (Named components single
  mappings). A platform device is represented in IORT as a named
  component node

For devices that are child devices of proprietary busses the IORT
firmware represents the bus node as a named component node in IORT
and it is up to that named component node to define in/out bus
specific ID translations for the bus child devices that are
allocated and created in a bus specific manner.

In order to make IORT ID translations available for proprietary
bus child devices, the current ACPI (and IORT) code must be
augmented to provide an additional ID parameter to acpi_dma_configure()
representing the child devices input ID. This ID is bus specific
and it is retrieved in bus specific code.

By adding an ID parameter to acpi_dma_configure(), the IORT
code can map the child device ID to an IOMMU stream ID through
the IORT named component representing the bus in/out ID mappings.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Link: https://lore.kernel.org/r/20200619082013.13661-6-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/acpi/arm64/iort.c | 59 +++++++++++++++++++++++++++++----------
 drivers/acpi/scan.c       |  8 ++++--
 include/acpi/acpi_bus.h   |  9 ++++--
 include/linux/acpi.h      |  7 +++++
 include/linux/acpi_iort.h |  7 +++--
 5 files changed, 67 insertions(+), 23 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 421c6976ab81..ec782e4a0fe4 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -978,19 +978,54 @@ static void iort_named_component_init(struct device *dev,
 					   nc->node_flags);
 }
 
+static int iort_nc_iommu_map(struct device *dev, struct acpi_iort_node *node)
+{
+	struct acpi_iort_node *parent;
+	int err = -ENODEV, i = 0;
+	u32 streamid = 0;
+
+	do {
+
+		parent = iort_node_map_platform_id(node, &streamid,
+						   IORT_IOMMU_TYPE,
+						   i++);
+
+		if (parent)
+			err = iort_iommu_xlate(dev, parent, streamid);
+	} while (parent && !err);
+
+	return err;
+}
+
+static int iort_nc_iommu_map_id(struct device *dev,
+				struct acpi_iort_node *node,
+				const u32 *in_id)
+{
+	struct acpi_iort_node *parent;
+	u32 streamid;
+
+	parent = iort_node_map_id(node, *in_id, &streamid, IORT_IOMMU_TYPE);
+	if (parent)
+		return iort_iommu_xlate(dev, parent, streamid);
+
+	return -ENODEV;
+}
+
+
 /**
- * iort_iommu_configure - Set-up IOMMU configuration for a device.
+ * iort_iommu_configure_id - Set-up IOMMU configuration for a device.
  *
  * @dev: device to configure
+ * @id_in: optional input id const value pointer
  *
  * Returns: iommu_ops pointer on configuration success
  *          NULL on configuration failure
  */
-const struct iommu_ops *iort_iommu_configure(struct device *dev)
+const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
+						const u32 *id_in)
 {
-	struct acpi_iort_node *node, *parent;
+	struct acpi_iort_node *node;
 	const struct iommu_ops *ops;
-	u32 streamid = 0;
 	int err = -ENODEV;
 
 	/*
@@ -1019,21 +1054,13 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev)
 		if (fwspec && iort_pci_rc_supports_ats(node))
 			fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS;
 	} else {
-		int i = 0;
-
 		node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT,
 				      iort_match_node_callback, dev);
 		if (!node)
 			return NULL;
 
-		do {
-			parent = iort_node_map_platform_id(node, &streamid,
-							   IORT_IOMMU_TYPE,
-							   i++);
-
-			if (parent)
-				err = iort_iommu_xlate(dev, parent, streamid);
-		} while (parent && !err);
+		err = id_in ? iort_nc_iommu_map_id(dev, node, id_in) :
+			      iort_nc_iommu_map(dev, node);
 
 		if (!err)
 			iort_named_component_init(dev, node);
@@ -1058,6 +1085,7 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev)
 
 	return ops;
 }
+
 #else
 static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev)
 { return NULL; }
@@ -1066,7 +1094,8 @@ static inline int iort_add_device_replay(const struct iommu_ops *ops,
 { return 0; }
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 { return 0; }
-const struct iommu_ops *iort_iommu_configure(struct device *dev)
+const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
+						const u32 *input_id)
 { return NULL; }
 #endif
 
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 8777faced51a..2142f1554761 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -1457,8 +1457,10 @@ int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
  * acpi_dma_configure - Set-up DMA configuration for the device.
  * @dev: The pointer to the device
  * @attr: device dma attributes
+ * @input_id: input device id const value pointer
  */
-int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr)
+int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
+			  const u32 *input_id)
 {
 	const struct iommu_ops *iommu;
 	u64 dma_addr = 0, size = 0;
@@ -1470,7 +1472,7 @@ int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr)
 
 	iort_dma_setup(dev, &dma_addr, &size);
 
-	iommu = iort_iommu_configure(dev);
+	iommu = iort_iommu_configure_id(dev, input_id);
 	if (PTR_ERR(iommu) == -EPROBE_DEFER)
 		return -EPROBE_DEFER;
 
@@ -1479,7 +1481,7 @@ int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(acpi_dma_configure);
+EXPORT_SYMBOL_GPL(acpi_dma_configure_id);
 
 static void acpi_init_coherency(struct acpi_device *adev)
 {
diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 5afb6ceb284f..a3abcc4b7d9f 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -588,8 +588,13 @@ bool acpi_dma_supported(struct acpi_device *adev);
 enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
 int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
 		       u64 *size);
-int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr);
-
+int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
+			   const u32 *input_id);
+static inline int acpi_dma_configure(struct device *dev,
+				     enum dev_dma_attr attr)
+{
+	return acpi_dma_configure_id(dev, attr, NULL);
+}
 struct acpi_device *acpi_find_child_device(struct acpi_device *parent,
 					   u64 address, bool check_children);
 int acpi_is_root_bridge(acpi_handle);
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index d661cd0ee64d..6d2c47489d90 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -905,6 +905,13 @@ static inline int acpi_dma_configure(struct device *dev,
 	return 0;
 }
 
+static inline int acpi_dma_configure_id(struct device *dev,
+					enum dev_dma_attr attr,
+					const u32 *input_id)
+{
+	return 0;
+}
+
 #define ACPI_PTR(_ptr)	(NULL)
 
 static inline void acpi_device_set_enumerated(struct acpi_device *adev)
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index e51425e083da..20a32120bb88 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -35,7 +35,8 @@ void acpi_configure_pmsi_domain(struct device *dev);
 int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
 void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size);
-const struct iommu_ops *iort_iommu_configure(struct device *dev);
+const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
+						const u32 *id_in);
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
 #else
 static inline void acpi_iort_init(void) { }
@@ -48,8 +49,8 @@ static inline void acpi_configure_pmsi_domain(struct device *dev) { }
 /* IOMMU interface */
 static inline void iort_dma_setup(struct device *dev, u64 *dma_addr,
 				  u64 *size) { }
-static inline const struct iommu_ops *iort_iommu_configure(
-				      struct device *dev)
+static inline const struct iommu_ops *iort_iommu_configure_id(
+				      struct device *dev, const u32 *id_in)
 { return NULL; }
 static inline
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)

From 746a71d02b5d15817fcb13c956ba999a87773952 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Fri, 19 Jun 2020 09:20:07 +0100
Subject: [PATCH 470/502] of/iommu: Make of_map_rid() PCI agnostic

There is nothing PCI specific (other than the RID - requester ID)
in the of_map_rid() implementation, so the same function can be
reused for input/output IDs mapping for other busses just as well.

Rename the RID instances/names to a generic "id" tag.

No functionality change intended.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Acked-by: Joerg Roedel <jroedel@suse.de>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200619082013.13661-7-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/iommu/of_iommu.c |  4 ++--
 drivers/of/base.c        | 42 ++++++++++++++++++++--------------------
 drivers/of/irq.c         |  2 +-
 include/linux/of.h       |  4 ++--
 4 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 20738aacac89..016316244737 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -129,7 +129,7 @@ static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data)
 	struct of_phandle_args iommu_spec = { .args_count = 1 };
 	int err;
 
-	err = of_map_rid(info->np, alias, "iommu-map", "iommu-map-mask",
+	err = of_map_id(info->np, alias, "iommu-map", "iommu-map-mask",
 			 &iommu_spec.np, iommu_spec.args);
 	if (err)
 		return err == -ENODEV ? NO_IOMMU : err;
@@ -145,7 +145,7 @@ static int of_fsl_mc_iommu_init(struct fsl_mc_device *mc_dev,
 	struct of_phandle_args iommu_spec = { .args_count = 1 };
 	int err;
 
-	err = of_map_rid(master_np, mc_dev->icid, "iommu-map",
+	err = of_map_id(master_np, mc_dev->icid, "iommu-map",
 			 "iommu-map-mask", &iommu_spec.np,
 			 iommu_spec.args);
 	if (err)
diff --git a/drivers/of/base.c b/drivers/of/base.c
index ae03b1218b06..ea44fea99813 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -2201,15 +2201,15 @@ int of_find_last_cache_level(unsigned int cpu)
 }
 
 /**
- * of_map_rid - Translate a requester ID through a downstream mapping.
+ * of_map_id - Translate an ID through a downstream mapping.
  * @np: root complex device node.
- * @rid: device requester ID to map.
+ * @id: device ID to map.
  * @map_name: property name of the map to use.
  * @map_mask_name: optional property name of the mask to use.
  * @target: optional pointer to a target device node.
  * @id_out: optional pointer to receive the translated ID.
  *
- * Given a device requester ID, look up the appropriate implementation-defined
+ * Given a device ID, look up the appropriate implementation-defined
  * platform ID and/or the target device which receives transactions on that
  * ID, as per the "iommu-map" and "msi-map" bindings. Either of @target or
  * @id_out may be NULL if only the other is required. If @target points to
@@ -2219,11 +2219,11 @@ int of_find_last_cache_level(unsigned int cpu)
  *
  * Return: 0 on success or a standard error code on failure.
  */
-int of_map_rid(struct device_node *np, u32 rid,
+int of_map_id(struct device_node *np, u32 id,
 	       const char *map_name, const char *map_mask_name,
 	       struct device_node **target, u32 *id_out)
 {
-	u32 map_mask, masked_rid;
+	u32 map_mask, masked_id;
 	int map_len;
 	const __be32 *map = NULL;
 
@@ -2235,7 +2235,7 @@ int of_map_rid(struct device_node *np, u32 rid,
 		if (target)
 			return -ENODEV;
 		/* Otherwise, no map implies no translation */
-		*id_out = rid;
+		*id_out = id;
 		return 0;
 	}
 
@@ -2255,22 +2255,22 @@ int of_map_rid(struct device_node *np, u32 rid,
 	if (map_mask_name)
 		of_property_read_u32(np, map_mask_name, &map_mask);
 
-	masked_rid = map_mask & rid;
+	masked_id = map_mask & id;
 	for ( ; map_len > 0; map_len -= 4 * sizeof(*map), map += 4) {
 		struct device_node *phandle_node;
-		u32 rid_base = be32_to_cpup(map + 0);
+		u32 id_base = be32_to_cpup(map + 0);
 		u32 phandle = be32_to_cpup(map + 1);
 		u32 out_base = be32_to_cpup(map + 2);
-		u32 rid_len = be32_to_cpup(map + 3);
+		u32 id_len = be32_to_cpup(map + 3);
 
-		if (rid_base & ~map_mask) {
-			pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores rid-base (0x%x)\n",
+		if (id_base & ~map_mask) {
+			pr_err("%pOF: Invalid %s translation - %s-mask (0x%x) ignores id-base (0x%x)\n",
 				np, map_name, map_name,
-				map_mask, rid_base);
+				map_mask, id_base);
 			return -EFAULT;
 		}
 
-		if (masked_rid < rid_base || masked_rid >= rid_base + rid_len)
+		if (masked_id < id_base || masked_id >= id_base + id_len)
 			continue;
 
 		phandle_node = of_find_node_by_phandle(phandle);
@@ -2288,20 +2288,20 @@ int of_map_rid(struct device_node *np, u32 rid,
 		}
 
 		if (id_out)
-			*id_out = masked_rid - rid_base + out_base;
+			*id_out = masked_id - id_base + out_base;
 
-		pr_debug("%pOF: %s, using mask %08x, rid-base: %08x, out-base: %08x, length: %08x, rid: %08x -> %08x\n",
-			np, map_name, map_mask, rid_base, out_base,
-			rid_len, rid, masked_rid - rid_base + out_base);
+		pr_debug("%pOF: %s, using mask %08x, id-base: %08x, out-base: %08x, length: %08x, id: %08x -> %08x\n",
+			np, map_name, map_mask, id_base, out_base,
+			id_len, id, masked_id - id_base + out_base);
 		return 0;
 	}
 
-	pr_info("%pOF: no %s translation for rid 0x%x on %pOF\n", np, map_name,
-		rid, target && *target ? *target : NULL);
+	pr_info("%pOF: no %s translation for id 0x%x on %pOF\n", np, map_name,
+		id, target && *target ? *target : NULL);
 
 	/* Bypasses translation */
 	if (id_out)
-		*id_out = rid;
+		*id_out = id;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(of_map_rid);
+EXPORT_SYMBOL_GPL(of_map_id);
diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index a296eaf52a5b..d632bc5b3a2d 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -587,7 +587,7 @@ static u32 __of_msi_map_rid(struct device *dev, struct device_node **np,
 	 * "msi-map" property.
 	 */
 	for (parent_dev = dev; parent_dev; parent_dev = parent_dev->parent)
-		if (!of_map_rid(parent_dev->of_node, rid_in, "msi-map",
+		if (!of_map_id(parent_dev->of_node, rid_in, "msi-map",
 				"msi-map-mask", np, &rid_out))
 			break;
 	return rid_out;
diff --git a/include/linux/of.h b/include/linux/of.h
index c669c0a4732f..60abe3f636ad 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -554,7 +554,7 @@ bool of_console_check(struct device_node *dn, char *name, int index);
 
 extern int of_cpu_node_to_id(struct device_node *np);
 
-int of_map_rid(struct device_node *np, u32 rid,
+int of_map_id(struct device_node *np, u32 id,
 	       const char *map_name, const char *map_mask_name,
 	       struct device_node **target, u32 *id_out);
 
@@ -978,7 +978,7 @@ static inline int of_cpu_node_to_id(struct device_node *np)
 	return -ENODEV;
 }
 
-static inline int of_map_rid(struct device_node *np, u32 rid,
+static inline int of_map_id(struct device_node *np, u32 id,
 			     const char *map_name, const char *map_mask_name,
 			     struct device_node **target, u32 *id_out)
 {

From a081bd4af4ce80d845a0bab355ab5d0822db8058 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Fri, 19 Jun 2020 09:20:08 +0100
Subject: [PATCH 471/502] of/device: Add input id to of_dma_configure()

Devices sitting on proprietary busses have a device ID space that
is owned by the respective bus and related firmware bindings. In order
to let the generic OF layer handle the input translations to
an IOMMU id, for such busses the current of_dma_configure() interface
should be extended in order to allow the bus layer to provide the
device input id parameter - that is retrieved/assigned in bus
specific code and firmware.

Augment of_dma_configure() to add an optional input_id parameter,
leaving current functionality unchanged.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Link: https://lore.kernel.org/r/20200619082013.13661-8-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/bus/fsl-mc/fsl-mc-bus.c |  4 +-
 drivers/iommu/of_iommu.c        | 85 ++++++++++++++++++---------------
 drivers/of/device.c             |  8 ++--
 include/linux/of_device.h       | 16 ++++++-
 include/linux/of_iommu.h        |  6 ++-
 5 files changed, 72 insertions(+), 47 deletions(-)

diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index 40526da5c6a6..8ead3f0238f2 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -118,11 +118,13 @@ static int fsl_mc_bus_uevent(struct device *dev, struct kobj_uevent_env *env)
 static int fsl_mc_dma_configure(struct device *dev)
 {
 	struct device *dma_dev = dev;
+	struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev);
+	u32 input_id = mc_dev->icid;
 
 	while (dev_is_fsl_mc(dma_dev))
 		dma_dev = dma_dev->parent;
 
-	return of_dma_configure(dev, dma_dev->of_node, 0);
+	return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id);
 }
 
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c
index 016316244737..e505b9130a1c 100644
--- a/drivers/iommu/of_iommu.c
+++ b/drivers/iommu/of_iommu.c
@@ -118,6 +118,43 @@ static int of_iommu_xlate(struct device *dev,
 	return ret;
 }
 
+static int of_iommu_configure_dev_id(struct device_node *master_np,
+				     struct device *dev,
+				     const u32 *id)
+{
+	struct of_phandle_args iommu_spec = { .args_count = 1 };
+	int err;
+
+	err = of_map_id(master_np, *id, "iommu-map",
+			 "iommu-map-mask", &iommu_spec.np,
+			 iommu_spec.args);
+	if (err)
+		return err == -ENODEV ? NO_IOMMU : err;
+
+	err = of_iommu_xlate(dev, &iommu_spec);
+	of_node_put(iommu_spec.np);
+	return err;
+}
+
+static int of_iommu_configure_dev(struct device_node *master_np,
+				  struct device *dev)
+{
+	struct of_phandle_args iommu_spec;
+	int err = NO_IOMMU, idx = 0;
+
+	while (!of_parse_phandle_with_args(master_np, "iommus",
+					   "#iommu-cells",
+					   idx, &iommu_spec)) {
+		err = of_iommu_xlate(dev, &iommu_spec);
+		of_node_put(iommu_spec.np);
+		idx++;
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
 struct of_pci_iommu_alias_info {
 	struct device *dev;
 	struct device_node *np;
@@ -126,38 +163,21 @@ struct of_pci_iommu_alias_info {
 static int of_pci_iommu_init(struct pci_dev *pdev, u16 alias, void *data)
 {
 	struct of_pci_iommu_alias_info *info = data;
-	struct of_phandle_args iommu_spec = { .args_count = 1 };
-	int err;
+	u32 input_id = alias;
 
-	err = of_map_id(info->np, alias, "iommu-map", "iommu-map-mask",
-			 &iommu_spec.np, iommu_spec.args);
-	if (err)
-		return err == -ENODEV ? NO_IOMMU : err;
-
-	err = of_iommu_xlate(info->dev, &iommu_spec);
-	of_node_put(iommu_spec.np);
-	return err;
+	return of_iommu_configure_dev_id(info->np, info->dev, &input_id);
 }
 
-static int of_fsl_mc_iommu_init(struct fsl_mc_device *mc_dev,
-				struct device_node *master_np)
+static int of_iommu_configure_device(struct device_node *master_np,
+				     struct device *dev, const u32 *id)
 {
-	struct of_phandle_args iommu_spec = { .args_count = 1 };
-	int err;
-
-	err = of_map_id(master_np, mc_dev->icid, "iommu-map",
-			 "iommu-map-mask", &iommu_spec.np,
-			 iommu_spec.args);
-	if (err)
-		return err == -ENODEV ? NO_IOMMU : err;
-
-	err = of_iommu_xlate(&mc_dev->dev, &iommu_spec);
-	of_node_put(iommu_spec.np);
-	return err;
+	return (id) ? of_iommu_configure_dev_id(master_np, dev, id) :
+		      of_iommu_configure_dev(master_np, dev);
 }
 
 const struct iommu_ops *of_iommu_configure(struct device *dev,
-					   struct device_node *master_np)
+					   struct device_node *master_np,
+					   const u32 *id)
 {
 	const struct iommu_ops *ops = NULL;
 	struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
@@ -188,21 +208,8 @@ const struct iommu_ops *of_iommu_configure(struct device *dev,
 		pci_request_acs();
 		err = pci_for_each_dma_alias(to_pci_dev(dev),
 					     of_pci_iommu_init, &info);
-	} else if (dev_is_fsl_mc(dev)) {
-		err = of_fsl_mc_iommu_init(to_fsl_mc_device(dev), master_np);
 	} else {
-		struct of_phandle_args iommu_spec;
-		int idx = 0;
-
-		while (!of_parse_phandle_with_args(master_np, "iommus",
-						   "#iommu-cells",
-						   idx, &iommu_spec)) {
-			err = of_iommu_xlate(dev, &iommu_spec);
-			of_node_put(iommu_spec.np);
-			idx++;
-			if (err)
-				break;
-		}
+		err = of_iommu_configure_device(master_np, dev, id);
 
 		fwspec = dev_iommu_fwspec_get(dev);
 		if (!err && fwspec)
diff --git a/drivers/of/device.c b/drivers/of/device.c
index 27203bfd0b22..b439c1e05434 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -78,6 +78,7 @@ int of_device_add(struct platform_device *ofdev)
  * @np:		Pointer to OF node having DMA configuration
  * @force_dma:  Whether device is to be set up by of_dma_configure() even if
  *		DMA capability is not explicitly described by firmware.
+ * @id:		Optional const pointer value input id
  *
  * Try to get devices's DMA configuration from DT and update it
  * accordingly.
@@ -86,7 +87,8 @@ int of_device_add(struct platform_device *ofdev)
  * can use a platform bus notifier and handle BUS_NOTIFY_ADD_DEVICE events
  * to fix up DMA configuration.
  */
-int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma)
+int of_dma_configure_id(struct device *dev, struct device_node *np,
+			bool force_dma, const u32 *id)
 {
 	u64 dma_addr, paddr, size = 0;
 	int ret;
@@ -160,7 +162,7 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma)
 	dev_dbg(dev, "device is%sdma coherent\n",
 		coherent ? " " : " not ");
 
-	iommu = of_iommu_configure(dev, np);
+	iommu = of_iommu_configure(dev, np, id);
 	if (PTR_ERR(iommu) == -EPROBE_DEFER)
 		return -EPROBE_DEFER;
 
@@ -171,7 +173,7 @@ int of_dma_configure(struct device *dev, struct device_node *np, bool force_dma)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(of_dma_configure);
+EXPORT_SYMBOL_GPL(of_dma_configure_id);
 
 int of_device_register(struct platform_device *pdev)
 {
diff --git a/include/linux/of_device.h b/include/linux/of_device.h
index 8d31e39dd564..07ca187fc5e4 100644
--- a/include/linux/of_device.h
+++ b/include/linux/of_device.h
@@ -55,9 +55,15 @@ static inline struct device_node *of_cpu_device_node_get(int cpu)
 	return of_node_get(cpu_dev->of_node);
 }
 
-int of_dma_configure(struct device *dev,
+int of_dma_configure_id(struct device *dev,
 		     struct device_node *np,
-		     bool force_dma);
+		     bool force_dma, const u32 *id);
+static inline int of_dma_configure(struct device *dev,
+				   struct device_node *np,
+				   bool force_dma)
+{
+	return of_dma_configure_id(dev, np, force_dma, NULL);
+}
 #else /* CONFIG_OF */
 
 static inline int of_driver_match_device(struct device *dev,
@@ -106,6 +112,12 @@ static inline struct device_node *of_cpu_device_node_get(int cpu)
 	return NULL;
 }
 
+static inline int of_dma_configure_id(struct device *dev,
+				   struct device_node *np,
+				   bool force_dma, const u32 *id)
+{
+	return 0;
+}
 static inline int of_dma_configure(struct device *dev,
 				   struct device_node *np,
 				   bool force_dma)
diff --git a/include/linux/of_iommu.h b/include/linux/of_iommu.h
index f3d40dd7bb66..16f4b3e87f20 100644
--- a/include/linux/of_iommu.h
+++ b/include/linux/of_iommu.h
@@ -13,7 +13,8 @@ extern int of_get_dma_window(struct device_node *dn, const char *prefix,
 			     size_t *size);
 
 extern const struct iommu_ops *of_iommu_configure(struct device *dev,
-					struct device_node *master_np);
+					struct device_node *master_np,
+					const u32 *id);
 
 #else
 
@@ -25,7 +26,8 @@ static inline int of_get_dma_window(struct device_node *dn, const char *prefix,
 }
 
 static inline const struct iommu_ops *of_iommu_configure(struct device *dev,
-					 struct device_node *master_np)
+					 struct device_node *master_np,
+					 const u32 *id)
 {
 	return NULL;
 }

From 5bda70c6162de9536cc983eacd24261c9c5de596 Mon Sep 17 00:00:00 2001
From: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Date: Fri, 19 Jun 2020 09:20:09 +0100
Subject: [PATCH 472/502] dt-bindings: arm: fsl: Add msi-map device-tree
 binding for fsl-mc bus

The existing bindings cannot be used to specify the relationship
between fsl-mc devices and GIC ITSes.
Add a generic binding for mapping fsl-mc devices to GIC ITSes, using
msi-map property.
In addition, deprecate msi-parent property which no longer makes sense
now that we support translating the MSIs.

Signed-off-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Signed-off-by: Diana Craciun <diana.craciun@oss.nxp.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Cc: Rob Herring <robh+dt@kernel.org>
Link: https://lore.kernel.org/r/20200619082013.13661-9-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 .../devicetree/bindings/misc/fsl,qoriq-mc.txt | 50 ++++++++++++++++---
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
index 9134e9bcca56..ebd329181c14 100644
--- a/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
+++ b/Documentation/devicetree/bindings/misc/fsl,qoriq-mc.txt
@@ -28,6 +28,16 @@ Documentation/devicetree/bindings/iommu/iommu.txt.
 For arm-smmu binding, see:
 Documentation/devicetree/bindings/iommu/arm,smmu.yaml.
 
+The MSI writes are accompanied by sideband data which is derived from the ICID.
+The msi-map property is used to associate the devices with both the ITS
+controller and the sideband data which accompanies the writes.
+
+For generic MSI bindings, see
+Documentation/devicetree/bindings/interrupt-controller/msi.txt.
+
+For GICv3 and GIC ITS bindings, see:
+Documentation/devicetree/bindings/interrupt-controller/arm,gic-v3.yaml.
+
 Required properties:
 
     - compatible
@@ -49,11 +59,6 @@ Required properties:
                         region may not be present in some scenarios, such
                         as in the device tree presented to a virtual machine.
 
-    - msi-parent
-        Value type: <phandle>
-        Definition: Must be present and point to the MSI controller node
-                    handling message interrupts for the MC.
-
     - ranges
         Value type: <prop-encoded-array>
         Definition: A standard property.  Defines the mapping between the child
@@ -119,6 +124,28 @@ Optional properties:
   associated with the listed IOMMU, with the iommu-specifier
   (i - icid-base + iommu-base).
 
+- msi-map: Maps an ICID to a GIC ITS and associated msi-specifier
+  data.
+
+  The property is an arbitrary number of tuples of
+  (icid-base,gic-its,msi-base,length).
+
+  Any ICID in the interval [icid-base, icid-base + length) is
+  associated with the listed GIC ITS, with the msi-specifier
+  (i - icid-base + msi-base).
+
+Deprecated properties:
+
+    - msi-parent
+        Value type: <phandle>
+        Definition: Describes the MSI controller node handling message
+                    interrupts for the MC. When there is no translation
+                    between the ICID and deviceID this property can be used
+                    to describe the MSI controller used by the devices on the
+                    mc-bus.
+                    The use of this property for mc-bus is deprecated. Please
+                    use msi-map.
+
 Example:
 
         smmu: iommu@5000000 {
@@ -128,13 +155,24 @@ Example:
                ...
         };
 
+        gic: interrupt-controller@6000000 {
+               compatible = "arm,gic-v3";
+               ...
+        };
+        its: gic-its@6020000 {
+               compatible = "arm,gic-v3-its";
+               msi-controller;
+               ...
+        };
+
         fsl_mc: fsl-mc@80c000000 {
                 compatible = "fsl,qoriq-mc";
                 reg = <0x00000008 0x0c000000 0 0x40>,    /* MC portal base */
                       <0x00000000 0x08340000 0 0x40000>; /* MC control reg */
-                msi-parent = <&its>;
                 /* define map for ICIDs 23-64 */
                 iommu-map = <23 &smmu 23 41>;
+                /* define msi map for ICIDs 23-64 */
+                msi-map = <23 &its 23 41>;
                 #address-cells = <3>;
                 #size-cells = <1>;
 

From 6f881aba01109a01a43e4f135673c19190f61133 Mon Sep 17 00:00:00 2001
From: Diana Craciun <diana.craciun@oss.nxp.com>
Date: Fri, 19 Jun 2020 09:20:10 +0100
Subject: [PATCH 473/502] of/irq: make of_msi_map_get_device_domain() bus
 agnostic

of_msi_map_get_device_domain() is PCI specific but it need not be and
can be easily changed to be bus agnostic in order to be used by other
busses by adding an IRQ domain bus token as an input parameter.

Signed-off-by: Diana Craciun <diana.craciun@oss.nxp.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>   # pci/msi.c
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200619082013.13661-10-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/of/irq.c       | 8 +++++---
 drivers/pci/msi.c      | 2 +-
 include/linux/of_irq.h | 5 +++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index d632bc5b3a2d..1005e4f349ef 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -613,18 +613,20 @@ u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in)
  * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain
  * @dev: device for which the mapping is to be done.
  * @rid: Requester ID for the device.
+ * @bus_token: Bus token
  *
  * Walk up the device hierarchy looking for devices with a "msi-map"
  * property.
  *
  * Returns: the MSI domain for this device (or NULL on failure)
  */
-struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 rid)
+struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id,
+						u32 bus_token)
 {
 	struct device_node *np = NULL;
 
-	__of_msi_map_rid(dev, &np, rid);
-	return irq_find_matching_host(np, DOMAIN_BUS_PCI_MSI);
+	__of_msi_map_rid(dev, &np, id);
+	return irq_find_matching_host(np, bus_token);
 }
 
 /**
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 77f48b95e277..b4bfe0b03b2d 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1556,7 +1556,7 @@ struct irq_domain *pci_msi_get_device_domain(struct pci_dev *pdev)
 	u32 rid = pci_dev_id(pdev);
 
 	pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid);
-	dom = of_msi_map_get_device_domain(&pdev->dev, rid);
+	dom = of_msi_map_get_device_domain(&pdev->dev, rid, DOMAIN_BUS_PCI_MSI);
 	if (!dom)
 		dom = iort_get_device_domain(&pdev->dev, rid,
 					     DOMAIN_BUS_PCI_MSI);
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index 1214cabb2247..7142a3722758 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -52,7 +52,8 @@ extern struct irq_domain *of_msi_get_domain(struct device *dev,
 					    struct device_node *np,
 					    enum irq_domain_bus_token token);
 extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev,
-						       u32 rid);
+							u32 id,
+							u32 bus_token);
 extern void of_msi_configure(struct device *dev, struct device_node *np);
 u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in);
 #else
@@ -85,7 +86,7 @@ static inline struct irq_domain *of_msi_get_domain(struct device *dev,
 	return NULL;
 }
 static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev,
-							      u32 rid)
+						u32 id, u32 bus_token)
 {
 	return NULL;
 }

From 2bcdd8f2c07f1aa1bfd34fa0dab8e06949e34846 Mon Sep 17 00:00:00 2001
From: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Date: Fri, 19 Jun 2020 09:20:11 +0100
Subject: [PATCH 474/502] of/irq: Make of_msi_map_rid() PCI bus agnostic

There is nothing PCI bus specific in the of_msi_map_rid()
implementation other than the requester ID tag for the input
ID space. Rename requester ID to a more generic ID so that
the translation code can be used by all busses that require
input/output ID translations.

No functional change intended.

Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Rob Herring <robh+dt@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200619082013.13661-11-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/of/irq.c       | 28 ++++++++++++++--------------
 drivers/pci/msi.c      |  2 +-
 include/linux/of_irq.h |  8 ++++----
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/drivers/of/irq.c b/drivers/of/irq.c
index 1005e4f349ef..25d17b8a1a1a 100644
--- a/drivers/of/irq.c
+++ b/drivers/of/irq.c
@@ -576,43 +576,43 @@ err:
 	}
 }
 
-static u32 __of_msi_map_rid(struct device *dev, struct device_node **np,
-			    u32 rid_in)
+static u32 __of_msi_map_id(struct device *dev, struct device_node **np,
+			    u32 id_in)
 {
 	struct device *parent_dev;
-	u32 rid_out = rid_in;
+	u32 id_out = id_in;
 
 	/*
 	 * Walk up the device parent links looking for one with a
 	 * "msi-map" property.
 	 */
 	for (parent_dev = dev; parent_dev; parent_dev = parent_dev->parent)
-		if (!of_map_id(parent_dev->of_node, rid_in, "msi-map",
-				"msi-map-mask", np, &rid_out))
+		if (!of_map_id(parent_dev->of_node, id_in, "msi-map",
+				"msi-map-mask", np, &id_out))
 			break;
-	return rid_out;
+	return id_out;
 }
 
 /**
- * of_msi_map_rid - Map a MSI requester ID for a device.
+ * of_msi_map_id - Map a MSI ID for a device.
  * @dev: device for which the mapping is to be done.
  * @msi_np: device node of the expected msi controller.
- * @rid_in: unmapped MSI requester ID for the device.
+ * @id_in: unmapped MSI ID for the device.
  *
  * Walk up the device hierarchy looking for devices with a "msi-map"
- * property.  If found, apply the mapping to @rid_in.
+ * property.  If found, apply the mapping to @id_in.
  *
- * Returns the mapped MSI requester ID.
+ * Returns the mapped MSI ID.
  */
-u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in)
+u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in)
 {
-	return __of_msi_map_rid(dev, &msi_np, rid_in);
+	return __of_msi_map_id(dev, &msi_np, id_in);
 }
 
 /**
  * of_msi_map_get_device_domain - Use msi-map to find the relevant MSI domain
  * @dev: device for which the mapping is to be done.
- * @rid: Requester ID for the device.
+ * @id: Device ID.
  * @bus_token: Bus token
  *
  * Walk up the device hierarchy looking for devices with a "msi-map"
@@ -625,7 +625,7 @@ struct irq_domain *of_msi_map_get_device_domain(struct device *dev, u32 id,
 {
 	struct device_node *np = NULL;
 
-	__of_msi_map_rid(dev, &np, id);
+	__of_msi_map_id(dev, &np, id);
 	return irq_find_matching_host(np, bus_token);
 }
 
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index b4bfe0b03b2d..19aeadb22f11 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1535,7 +1535,7 @@ u32 pci_msi_domain_get_msi_rid(struct irq_domain *domain, struct pci_dev *pdev)
 	pci_for_each_dma_alias(pdev, get_msi_id_cb, &rid);
 
 	of_node = irq_domain_get_of_node(domain);
-	rid = of_node ? of_msi_map_rid(&pdev->dev, of_node, rid) :
+	rid = of_node ? of_msi_map_id(&pdev->dev, of_node, rid) :
 			iort_msi_map_id(&pdev->dev, rid);
 
 	return rid;
diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h
index 7142a3722758..e8b78139f78c 100644
--- a/include/linux/of_irq.h
+++ b/include/linux/of_irq.h
@@ -55,7 +55,7 @@ extern struct irq_domain *of_msi_map_get_device_domain(struct device *dev,
 							u32 id,
 							u32 bus_token);
 extern void of_msi_configure(struct device *dev, struct device_node *np);
-u32 of_msi_map_rid(struct device *dev, struct device_node *msi_np, u32 rid_in);
+u32 of_msi_map_id(struct device *dev, struct device_node *msi_np, u32 id_in);
 #else
 static inline int of_irq_count(struct device_node *dev)
 {
@@ -93,10 +93,10 @@ static inline struct irq_domain *of_msi_map_get_device_domain(struct device *dev
 static inline void of_msi_configure(struct device *dev, struct device_node *np)
 {
 }
-static inline u32 of_msi_map_rid(struct device *dev,
-				 struct device_node *msi_np, u32 rid_in)
+static inline u32 of_msi_map_id(struct device *dev,
+				 struct device_node *msi_np, u32 id_in)
 {
-	return rid_in;
+	return id_in;
 }
 #endif
 

From 998fb7badf0362a2057694878098642ef363d899 Mon Sep 17 00:00:00 2001
From: Diana Craciun <diana.craciun@oss.nxp.com>
Date: Fri, 19 Jun 2020 09:20:12 +0100
Subject: [PATCH 475/502] bus/fsl-mc: Refactor the MSI domain creation in the
 DPRC driver

The DPRC driver is not taking into account the msi-map property
and assumes that the icid is the same as the stream ID. Although
this assumption is correct, generalize the code to include a
translation between icid and streamID.

Furthermore do not just copy the MSI domain from parent (for child
containers), but use the information provided by the msi-map property.

If the msi-map property is missing from the device tree, retain the old
behaviour for backward compatibility, i.e. the child DPRC objects
inherit the MSI domain from the parent.

Signed-off-by: Diana Craciun <diana.craciun@oss.nxp.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20200619082013.13661-12-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/bus/fsl-mc/dprc-driver.c            | 31 ++++++---------------
 drivers/bus/fsl-mc/fsl-mc-bus.c             |  4 +--
 drivers/bus/fsl-mc/fsl-mc-msi.c             | 29 +++++++++++--------
 drivers/bus/fsl-mc/fsl-mc-private.h         |  6 ++--
 drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c | 15 +++++++++-
 5 files changed, 46 insertions(+), 39 deletions(-)

diff --git a/drivers/bus/fsl-mc/dprc-driver.c b/drivers/bus/fsl-mc/dprc-driver.c
index c8b1c3842c1a..189bff2115a8 100644
--- a/drivers/bus/fsl-mc/dprc-driver.c
+++ b/drivers/bus/fsl-mc/dprc-driver.c
@@ -592,6 +592,7 @@ static int dprc_probe(struct fsl_mc_device *mc_dev)
 	bool mc_io_created = false;
 	bool msi_domain_set = false;
 	u16 major_ver, minor_ver;
+	struct irq_domain *mc_msi_domain;
 
 	if (!is_fsl_mc_bus_dprc(mc_dev))
 		return -EINVAL;
@@ -621,31 +622,15 @@ static int dprc_probe(struct fsl_mc_device *mc_dev)
 			return error;
 
 		mc_io_created = true;
+	}
 
-		/*
-		 * Inherit parent MSI domain:
-		 */
-		dev_set_msi_domain(&mc_dev->dev,
-				   dev_get_msi_domain(parent_dev));
-		msi_domain_set = true;
+	mc_msi_domain = fsl_mc_find_msi_domain(&mc_dev->dev);
+	if (!mc_msi_domain) {
+		dev_warn(&mc_dev->dev,
+			 "WARNING: MC bus without interrupt support\n");
 	} else {
-		/*
-		 * This is a root DPRC
-		 */
-		struct irq_domain *mc_msi_domain;
-
-		if (dev_is_fsl_mc(parent_dev))
-			return -EINVAL;
-
-		error = fsl_mc_find_msi_domain(parent_dev,
-					       &mc_msi_domain);
-		if (error < 0) {
-			dev_warn(&mc_dev->dev,
-				 "WARNING: MC bus without interrupt support\n");
-		} else {
-			dev_set_msi_domain(&mc_dev->dev, mc_msi_domain);
-			msi_domain_set = true;
-		}
+		dev_set_msi_domain(&mc_dev->dev, mc_msi_domain);
+		msi_domain_set = true;
 	}
 
 	error = dprc_open(mc_dev->mc_io, 0, mc_dev->obj_desc.id,
diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index 8ead3f0238f2..824ff77bbe86 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -370,8 +370,8 @@ EXPORT_SYMBOL_GPL(fsl_mc_get_version);
 /**
  * fsl_mc_get_root_dprc - function to traverse to the root dprc
  */
-static void fsl_mc_get_root_dprc(struct device *dev,
-				 struct device **root_dprc_dev)
+void fsl_mc_get_root_dprc(struct device *dev,
+			 struct device **root_dprc_dev)
 {
 	if (!dev) {
 		*root_dprc_dev = NULL;
diff --git a/drivers/bus/fsl-mc/fsl-mc-msi.c b/drivers/bus/fsl-mc/fsl-mc-msi.c
index 8b9c66d7c4ff..e7bbff445a83 100644
--- a/drivers/bus/fsl-mc/fsl-mc-msi.c
+++ b/drivers/bus/fsl-mc/fsl-mc-msi.c
@@ -177,23 +177,30 @@ struct irq_domain *fsl_mc_msi_create_irq_domain(struct fwnode_handle *fwnode,
 	return domain;
 }
 
-int fsl_mc_find_msi_domain(struct device *mc_platform_dev,
-			   struct irq_domain **mc_msi_domain)
+struct irq_domain *fsl_mc_find_msi_domain(struct device *dev)
 {
-	struct irq_domain *msi_domain;
-	struct device_node *mc_of_node = mc_platform_dev->of_node;
+	struct irq_domain *msi_domain = NULL;
+	struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev);
 
-	msi_domain = of_msi_get_domain(mc_platform_dev, mc_of_node,
-				       DOMAIN_BUS_FSL_MC_MSI);
+	msi_domain = of_msi_map_get_device_domain(dev, mc_dev->icid,
+						  DOMAIN_BUS_FSL_MC_MSI);
+
+	/*
+	 * if the msi-map property is missing assume that all the
+	 * child containers inherit the domain from the parent
+	 */
 	if (!msi_domain) {
-		pr_err("Unable to find fsl-mc MSI domain for %pOF\n",
-		       mc_of_node);
+		struct device *root_dprc_dev;
+		struct device *bus_dev;
 
-		return -ENOENT;
+		fsl_mc_get_root_dprc(dev, &root_dprc_dev);
+		bus_dev = root_dprc_dev->parent;
+		msi_domain = of_msi_get_domain(bus_dev,
+					       bus_dev->of_node,
+					       DOMAIN_BUS_FSL_MC_MSI);
 	}
 
-	*mc_msi_domain = msi_domain;
-	return 0;
+	return msi_domain;
 }
 
 static void fsl_mc_msi_free_descs(struct device *dev)
diff --git a/drivers/bus/fsl-mc/fsl-mc-private.h b/drivers/bus/fsl-mc/fsl-mc-private.h
index 21ca8c756ee7..7a46a12eb747 100644
--- a/drivers/bus/fsl-mc/fsl-mc-private.h
+++ b/drivers/bus/fsl-mc/fsl-mc-private.h
@@ -595,8 +595,7 @@ int fsl_mc_msi_domain_alloc_irqs(struct device *dev,
 
 void fsl_mc_msi_domain_free_irqs(struct device *dev);
 
-int fsl_mc_find_msi_domain(struct device *mc_platform_dev,
-			   struct irq_domain **mc_msi_domain);
+struct irq_domain *fsl_mc_find_msi_domain(struct device *dev);
 
 int fsl_mc_populate_irq_pool(struct fsl_mc_bus *mc_bus,
 			     unsigned int irq_count);
@@ -613,6 +612,9 @@ void fsl_destroy_mc_io(struct fsl_mc_io *mc_io);
 
 bool fsl_mc_is_root_dprc(struct device *dev);
 
+void fsl_mc_get_root_dprc(struct device *dev,
+			 struct device **root_dprc_dev);
+
 struct fsl_mc_device *fsl_mc_device_lookup(struct fsl_mc_obj_desc *obj_desc,
 					   struct fsl_mc_device *mc_bus_dev);
 
diff --git a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c
index 606efa64adff..a5c8d577e424 100644
--- a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c
+++ b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c
@@ -23,6 +23,18 @@ static struct irq_chip its_msi_irq_chip = {
 	.irq_set_affinity = msi_domain_set_affinity
 };
 
+static u32 fsl_mc_msi_domain_get_msi_id(struct irq_domain *domain,
+					struct fsl_mc_device *mc_dev)
+{
+	struct device_node *of_node;
+	u32 out_id;
+
+	of_node = irq_domain_get_of_node(domain);
+	out_id = of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid);
+
+	return out_id;
+}
+
 static int its_fsl_mc_msi_prepare(struct irq_domain *msi_domain,
 				  struct device *dev,
 				  int nvec, msi_alloc_info_t *info)
@@ -43,7 +55,8 @@ static int its_fsl_mc_msi_prepare(struct irq_domain *msi_domain,
 	 * NOTE: This device id corresponds to the IOMMU stream ID
 	 * associated with the DPRC object (ICID).
 	 */
-	info->scratchpad[0].ul = mc_bus_dev->icid;
+	info->scratchpad[0].ul = fsl_mc_msi_domain_get_msi_id(msi_domain,
+							      mc_bus_dev);
 	msi_info = msi_get_domain_info(msi_domain->parent);
 
 	/* Allocate at least 32 MSIs, and always as a power of 2 */

From 6305166c8771c33a8d5992fb53f93cfecedc14fd Mon Sep 17 00:00:00 2001
From: Makarand Pawagi <makarand.pawagi@nxp.com>
Date: Fri, 19 Jun 2020 09:20:13 +0100
Subject: [PATCH 476/502] bus: fsl-mc: Add ACPI support for fsl-mc

Add ACPI support in the fsl-mc driver. Driver parses MC DSDT table to
extract memory and other resources.

Interrupt (GIC ITS) information is extracted from the MADT table
by drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c.

IORT table is parsed to configure DMA.

Signed-off-by: Makarand Pawagi <makarand.pawagi@nxp.com>
Signed-off-by: Diana Craciun <diana.craciun@oss.nxp.com>
Signed-off-by: Laurentiu Tudor <laurentiu.tudor@nxp.com>
Link: https://lore.kernel.org/r/20200619082013.13661-13-lorenzo.pieralisi@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/bus/fsl-mc/fsl-mc-bus.c             | 73 ++++++++++++----
 drivers/bus/fsl-mc/fsl-mc-msi.c             | 35 ++++----
 drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c | 92 ++++++++++++++++-----
 3 files changed, 149 insertions(+), 51 deletions(-)

diff --git a/drivers/bus/fsl-mc/fsl-mc-bus.c b/drivers/bus/fsl-mc/fsl-mc-bus.c
index 824ff77bbe86..324d49d6df89 100644
--- a/drivers/bus/fsl-mc/fsl-mc-bus.c
+++ b/drivers/bus/fsl-mc/fsl-mc-bus.c
@@ -18,6 +18,8 @@
 #include <linux/bitops.h>
 #include <linux/msi.h>
 #include <linux/dma-mapping.h>
+#include <linux/acpi.h>
+#include <linux/iommu.h>
 
 #include "fsl-mc-private.h"
 
@@ -38,6 +40,7 @@ struct fsl_mc {
 	struct fsl_mc_device *root_mc_bus_dev;
 	u8 num_translation_ranges;
 	struct fsl_mc_addr_translation_range *translation_ranges;
+	void *fsl_mc_regs;
 };
 
 /**
@@ -56,6 +59,10 @@ struct fsl_mc_addr_translation_range {
 	phys_addr_t start_phys_addr;
 };
 
+#define FSL_MC_FAPR	0x28
+#define MC_FAPR_PL	BIT(18)
+#define MC_FAPR_BMT	BIT(17)
+
 /**
  * fsl_mc_bus_match - device to driver matching callback
  * @dev: the fsl-mc device to match against
@@ -124,7 +131,10 @@ static int fsl_mc_dma_configure(struct device *dev)
 	while (dev_is_fsl_mc(dma_dev))
 		dma_dev = dma_dev->parent;
 
-	return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id);
+	if (dev_of_node(dma_dev))
+		return of_dma_configure_id(dev, dma_dev->of_node, 0, &input_id);
+
+	return acpi_dma_configure_id(dev, DEV_DMA_COHERENT, &input_id);
 }
 
 static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
@@ -865,8 +875,11 @@ static int fsl_mc_bus_probe(struct platform_device *pdev)
 	struct fsl_mc_io *mc_io = NULL;
 	int container_id;
 	phys_addr_t mc_portal_phys_addr;
-	u32 mc_portal_size;
-	struct resource res;
+	u32 mc_portal_size, mc_stream_id;
+	struct resource *plat_res;
+
+	if (!iommu_present(&fsl_mc_bus_type))
+		return -EPROBE_DEFER;
 
 	mc = devm_kzalloc(&pdev->dev, sizeof(*mc), GFP_KERNEL);
 	if (!mc)
@@ -874,19 +887,33 @@ static int fsl_mc_bus_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, mc);
 
+	plat_res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+	mc->fsl_mc_regs = devm_ioremap_resource(&pdev->dev, plat_res);
+	if (IS_ERR(mc->fsl_mc_regs))
+		return PTR_ERR(mc->fsl_mc_regs);
+
+	if (IS_ENABLED(CONFIG_ACPI) && !dev_of_node(&pdev->dev)) {
+		mc_stream_id = readl(mc->fsl_mc_regs + FSL_MC_FAPR);
+		/*
+		 * HW ORs the PL and BMT bit, places the result in bit 15 of
+		 * the StreamID and ORs in the ICID. Calculate it accordingly.
+		 */
+		mc_stream_id = (mc_stream_id & 0xffff) |
+				((mc_stream_id & (MC_FAPR_PL | MC_FAPR_BMT)) ?
+					0x4000 : 0);
+		error = acpi_dma_configure_id(&pdev->dev, DEV_DMA_COHERENT,
+					      &mc_stream_id);
+		if (error)
+			dev_warn(&pdev->dev, "failed to configure dma: %d.\n",
+				 error);
+	}
+
 	/*
 	 * Get physical address of MC portal for the root DPRC:
 	 */
-	error = of_address_to_resource(pdev->dev.of_node, 0, &res);
-	if (error < 0) {
-		dev_err(&pdev->dev,
-			"of_address_to_resource() failed for %pOF\n",
-			pdev->dev.of_node);
-		return error;
-	}
-
-	mc_portal_phys_addr = res.start;
-	mc_portal_size = resource_size(&res);
+	plat_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	mc_portal_phys_addr = plat_res->start;
+	mc_portal_size = resource_size(plat_res);
 	error = fsl_create_mc_io(&pdev->dev, mc_portal_phys_addr,
 				 mc_portal_size, NULL,
 				 FSL_MC_IO_ATOMIC_CONTEXT_PORTAL, &mc_io);
@@ -903,11 +930,13 @@ static int fsl_mc_bus_probe(struct platform_device *pdev)
 	dev_info(&pdev->dev, "MC firmware version: %u.%u.%u\n",
 		 mc_version.major, mc_version.minor, mc_version.revision);
 
-	error = get_mc_addr_translation_ranges(&pdev->dev,
-					       &mc->translation_ranges,
-					       &mc->num_translation_ranges);
-	if (error < 0)
-		goto error_cleanup_mc_io;
+	if (dev_of_node(&pdev->dev)) {
+		error = get_mc_addr_translation_ranges(&pdev->dev,
+						&mc->translation_ranges,
+						&mc->num_translation_ranges);
+		if (error < 0)
+			goto error_cleanup_mc_io;
+	}
 
 	error = dprc_get_container_id(mc_io, 0, &container_id);
 	if (error < 0) {
@@ -934,6 +963,7 @@ static int fsl_mc_bus_probe(struct platform_device *pdev)
 		goto error_cleanup_mc_io;
 
 	mc->root_mc_bus_dev = mc_bus_dev;
+	mc_bus_dev->dev.fwnode = pdev->dev.fwnode;
 	return 0;
 
 error_cleanup_mc_io:
@@ -967,11 +997,18 @@ static const struct of_device_id fsl_mc_bus_match_table[] = {
 
 MODULE_DEVICE_TABLE(of, fsl_mc_bus_match_table);
 
+static const struct acpi_device_id fsl_mc_bus_acpi_match_table[] = {
+	{"NXP0008", 0 },
+	{ }
+};
+MODULE_DEVICE_TABLE(acpi, fsl_mc_bus_acpi_match_table);
+
 static struct platform_driver fsl_mc_bus_driver = {
 	.driver = {
 		   .name = "fsl_mc_bus",
 		   .pm = NULL,
 		   .of_match_table = fsl_mc_bus_match_table,
+		   .acpi_match_table = fsl_mc_bus_acpi_match_table,
 		   },
 	.probe = fsl_mc_bus_probe,
 	.remove = fsl_mc_bus_remove,
diff --git a/drivers/bus/fsl-mc/fsl-mc-msi.c b/drivers/bus/fsl-mc/fsl-mc-msi.c
index e7bbff445a83..8edadf05cbb7 100644
--- a/drivers/bus/fsl-mc/fsl-mc-msi.c
+++ b/drivers/bus/fsl-mc/fsl-mc-msi.c
@@ -13,6 +13,7 @@
 #include <linux/irq.h>
 #include <linux/irqdomain.h>
 #include <linux/msi.h>
+#include <linux/acpi_iort.h>
 
 #include "fsl-mc-private.h"
 
@@ -179,25 +180,31 @@ struct irq_domain *fsl_mc_msi_create_irq_domain(struct fwnode_handle *fwnode,
 
 struct irq_domain *fsl_mc_find_msi_domain(struct device *dev)
 {
-	struct irq_domain *msi_domain = NULL;
+	struct device *root_dprc_dev;
+	struct device *bus_dev;
+	struct irq_domain *msi_domain;
 	struct fsl_mc_device *mc_dev = to_fsl_mc_device(dev);
 
-	msi_domain = of_msi_map_get_device_domain(dev, mc_dev->icid,
+	fsl_mc_get_root_dprc(dev, &root_dprc_dev);
+	bus_dev = root_dprc_dev->parent;
+
+	if (bus_dev->of_node) {
+		msi_domain = of_msi_map_get_device_domain(dev,
+						  mc_dev->icid,
 						  DOMAIN_BUS_FSL_MC_MSI);
 
-	/*
-	 * if the msi-map property is missing assume that all the
-	 * child containers inherit the domain from the parent
-	 */
-	if (!msi_domain) {
-		struct device *root_dprc_dev;
-		struct device *bus_dev;
+		/*
+		 * if the msi-map property is missing assume that all the
+		 * child containers inherit the domain from the parent
+		 */
+		if (!msi_domain)
 
-		fsl_mc_get_root_dprc(dev, &root_dprc_dev);
-		bus_dev = root_dprc_dev->parent;
-		msi_domain = of_msi_get_domain(bus_dev,
-					       bus_dev->of_node,
-					       DOMAIN_BUS_FSL_MC_MSI);
+			msi_domain = of_msi_get_domain(bus_dev,
+						bus_dev->of_node,
+						DOMAIN_BUS_FSL_MC_MSI);
+	} else {
+		msi_domain = iort_get_device_domain(dev, mc_dev->icid,
+						    DOMAIN_BUS_FSL_MC_MSI);
 	}
 
 	return msi_domain;
diff --git a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c
index a5c8d577e424..634263dfd7b5 100644
--- a/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c
+++ b/drivers/irqchip/irq-gic-v3-its-fsl-mc-msi.c
@@ -7,6 +7,8 @@
  *
  */
 
+#include <linux/acpi.h>
+#include <linux/acpi_iort.h>
 #include <linux/of_device.h>
 #include <linux/of_address.h>
 #include <linux/irq.h>
@@ -30,7 +32,8 @@ static u32 fsl_mc_msi_domain_get_msi_id(struct irq_domain *domain,
 	u32 out_id;
 
 	of_node = irq_domain_get_of_node(domain);
-	out_id = of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid);
+	out_id = of_node ? of_msi_map_id(&mc_dev->dev, of_node, mc_dev->icid) :
+			iort_msi_map_id(&mc_dev->dev, mc_dev->icid);
 
 	return out_id;
 }
@@ -79,12 +82,71 @@ static const struct of_device_id its_device_id[] = {
 	{},
 };
 
-static int __init its_fsl_mc_msi_init(void)
+static void __init its_fsl_mc_msi_init_one(struct fwnode_handle *handle,
+					  const char *name)
 {
-	struct device_node *np;
 	struct irq_domain *parent;
 	struct irq_domain *mc_msi_domain;
 
+	parent = irq_find_matching_fwnode(handle, DOMAIN_BUS_NEXUS);
+	if (!parent || !msi_get_domain_info(parent)) {
+		pr_err("%s: unable to locate ITS domain\n", name);
+		return;
+	}
+
+	mc_msi_domain = fsl_mc_msi_create_irq_domain(handle,
+						&its_fsl_mc_msi_domain_info,
+						parent);
+	if (!mc_msi_domain) {
+		pr_err("%s: unable to create fsl-mc domain\n", name);
+		return;
+	}
+
+	pr_info("fsl-mc MSI: %s domain created\n", name);
+}
+
+#ifdef CONFIG_ACPI
+static int __init
+its_fsl_mc_msi_parse_madt(union acpi_subtable_headers *header,
+			  const unsigned long end)
+{
+	struct acpi_madt_generic_translator *its_entry;
+	struct fwnode_handle *dom_handle;
+	const char *node_name;
+	int err = 0;
+
+	its_entry = (struct acpi_madt_generic_translator *)header;
+	node_name = kasprintf(GFP_KERNEL, "ITS@0x%lx",
+			      (long)its_entry->base_address);
+
+	dom_handle = iort_find_domain_token(its_entry->translation_id);
+	if (!dom_handle) {
+		pr_err("%s: Unable to locate ITS domain handle\n", node_name);
+		err = -ENXIO;
+		goto out;
+	}
+
+	its_fsl_mc_msi_init_one(dom_handle, node_name);
+
+out:
+	kfree(node_name);
+	return err;
+}
+
+
+static void __init its_fsl_mc_acpi_msi_init(void)
+{
+	acpi_table_parse_madt(ACPI_MADT_TYPE_GENERIC_TRANSLATOR,
+			      its_fsl_mc_msi_parse_madt, 0);
+}
+#else
+static inline void its_fsl_mc_acpi_msi_init(void) { }
+#endif
+
+static void __init its_fsl_mc_of_msi_init(void)
+{
+	struct device_node *np;
+
 	for (np = of_find_matching_node(NULL, its_device_id); np;
 	     np = of_find_matching_node(np, its_device_id)) {
 		if (!of_device_is_available(np))
@@ -92,23 +154,15 @@ static int __init its_fsl_mc_msi_init(void)
 		if (!of_property_read_bool(np, "msi-controller"))
 			continue;
 
-		parent = irq_find_matching_host(np, DOMAIN_BUS_NEXUS);
-		if (!parent || !msi_get_domain_info(parent)) {
-			pr_err("%pOF: unable to locate ITS domain\n", np);
-			continue;
-		}
-
-		mc_msi_domain = fsl_mc_msi_create_irq_domain(
-						 of_node_to_fwnode(np),
-						 &its_fsl_mc_msi_domain_info,
-						 parent);
-		if (!mc_msi_domain) {
-			pr_err("%pOF: unable to create fsl-mc domain\n", np);
-			continue;
-		}
-
-		pr_info("fsl-mc MSI: %pOF domain created\n", np);
+		its_fsl_mc_msi_init_one(of_node_to_fwnode(np),
+					np->full_name);
 	}
+}
+
+static int __init its_fsl_mc_msi_init(void)
+{
+	its_fsl_mc_of_msi_init();
+	its_fsl_mc_acpi_msi_init();
 
 	return 0;
 }

From 7ca8cf5347f720b07a0b32a924b768f5710547e7 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 29 Jul 2020 22:31:05 +1000
Subject: [PATCH 477/502] locking/atomic: Move ATOMIC_INIT into linux/types.h

This patch moves ATOMIC_INIT from asm/atomic.h into linux/types.h.
This allows users of atomic_t to use ATOMIC_INIT without having to
include atomic.h as that way may lead to header loops.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lkml.kernel.org/r/20200729123105.GB7047@gondor.apana.org.au
---
 arch/alpha/include/asm/atomic.h    | 1 -
 arch/arc/include/asm/atomic.h      | 2 --
 arch/arm/include/asm/atomic.h      | 2 --
 arch/arm64/include/asm/atomic.h    | 2 --
 arch/h8300/include/asm/atomic.h    | 2 --
 arch/hexagon/include/asm/atomic.h  | 2 --
 arch/ia64/include/asm/atomic.h     | 1 -
 arch/m68k/include/asm/atomic.h     | 2 --
 arch/mips/include/asm/atomic.h     | 1 -
 arch/parisc/include/asm/atomic.h   | 2 --
 arch/powerpc/include/asm/atomic.h  | 2 --
 arch/riscv/include/asm/atomic.h    | 2 --
 arch/s390/include/asm/atomic.h     | 2 --
 arch/sh/include/asm/atomic.h       | 2 --
 arch/sparc/include/asm/atomic_32.h | 2 --
 arch/sparc/include/asm/atomic_64.h | 1 -
 arch/x86/include/asm/atomic.h      | 2 --
 arch/xtensa/include/asm/atomic.h   | 2 --
 include/asm-generic/atomic.h       | 2 --
 include/linux/types.h              | 2 ++
 20 files changed, 2 insertions(+), 34 deletions(-)

diff --git a/arch/alpha/include/asm/atomic.h b/arch/alpha/include/asm/atomic.h
index 2144530d1428..e2093994fd0d 100644
--- a/arch/alpha/include/asm/atomic.h
+++ b/arch/alpha/include/asm/atomic.h
@@ -24,7 +24,6 @@
 #define __atomic_acquire_fence()
 #define __atomic_post_full_fence()
 
-#define ATOMIC_INIT(i)		{ (i) }
 #define ATOMIC64_INIT(i)	{ (i) }
 
 #define atomic_read(v)		READ_ONCE((v)->counter)
diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h
index 7298ce84762e..c614857eb209 100644
--- a/arch/arc/include/asm/atomic.h
+++ b/arch/arc/include/asm/atomic.h
@@ -14,8 +14,6 @@
 #include <asm/barrier.h>
 #include <asm/smp.h>
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 #ifndef CONFIG_ARC_PLAT_EZNPS
 
 #define atomic_read(v)  READ_ONCE((v)->counter)
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 75bb2c543e59..455eb19a5ac1 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -15,8 +15,6 @@
 #include <asm/barrier.h>
 #include <asm/cmpxchg.h>
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 #ifdef __KERNEL__
 
 /*
diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h
index a08890da696c..015ddffaf6ca 100644
--- a/arch/arm64/include/asm/atomic.h
+++ b/arch/arm64/include/asm/atomic.h
@@ -99,8 +99,6 @@ static inline long arch_atomic64_dec_if_positive(atomic64_t *v)
 	return __lse_ll_sc_body(atomic64_dec_if_positive, v);
 }
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 #define arch_atomic_read(v)			__READ_ONCE((v)->counter)
 #define arch_atomic_set(v, i)			__WRITE_ONCE(((v)->counter), (i))
 
diff --git a/arch/h8300/include/asm/atomic.h b/arch/h8300/include/asm/atomic.h
index c6b6a06231b2..a990d151f163 100644
--- a/arch/h8300/include/asm/atomic.h
+++ b/arch/h8300/include/asm/atomic.h
@@ -12,8 +12,6 @@
  * resource counting etc..
  */
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 #define atomic_read(v)		READ_ONCE((v)->counter)
 #define atomic_set(v, i)	WRITE_ONCE(((v)->counter), (i))
 
diff --git a/arch/hexagon/include/asm/atomic.h b/arch/hexagon/include/asm/atomic.h
index 0231d69c8bf2..4ab895d7111f 100644
--- a/arch/hexagon/include/asm/atomic.h
+++ b/arch/hexagon/include/asm/atomic.h
@@ -12,8 +12,6 @@
 #include <asm/cmpxchg.h>
 #include <asm/barrier.h>
 
-#define ATOMIC_INIT(i)		{ (i) }
-
 /*  Normal writes in our arch don't clear lock reservations  */
 
 static inline void atomic_set(atomic_t *v, int new)
diff --git a/arch/ia64/include/asm/atomic.h b/arch/ia64/include/asm/atomic.h
index 50440f3ddc43..f267d956458f 100644
--- a/arch/ia64/include/asm/atomic.h
+++ b/arch/ia64/include/asm/atomic.h
@@ -19,7 +19,6 @@
 #include <asm/barrier.h>
 
 
-#define ATOMIC_INIT(i)		{ (i) }
 #define ATOMIC64_INIT(i)	{ (i) }
 
 #define atomic_read(v)		READ_ONCE((v)->counter)
diff --git a/arch/m68k/include/asm/atomic.h b/arch/m68k/include/asm/atomic.h
index 47228b0d4163..756c5cc58f94 100644
--- a/arch/m68k/include/asm/atomic.h
+++ b/arch/m68k/include/asm/atomic.h
@@ -16,8 +16,6 @@
  * We do not have SMP m68k systems, so we don't have to deal with that.
  */
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 #define atomic_read(v)		READ_ONCE((v)->counter)
 #define atomic_set(v, i)	WRITE_ONCE(((v)->counter), (i))
 
diff --git a/arch/mips/include/asm/atomic.h b/arch/mips/include/asm/atomic.h
index e5ac88392d1f..f904084fcb1f 100644
--- a/arch/mips/include/asm/atomic.h
+++ b/arch/mips/include/asm/atomic.h
@@ -45,7 +45,6 @@ static __always_inline type pfx##_xchg(pfx##_t *v, type n)		\
 	return xchg(&v->counter, n);					\
 }
 
-#define ATOMIC_INIT(i)		{ (i) }
 ATOMIC_OPS(atomic, int)
 
 #ifdef CONFIG_64BIT
diff --git a/arch/parisc/include/asm/atomic.h b/arch/parisc/include/asm/atomic.h
index 118953d41763..f960e2f32b1b 100644
--- a/arch/parisc/include/asm/atomic.h
+++ b/arch/parisc/include/asm/atomic.h
@@ -136,8 +136,6 @@ ATOMIC_OPS(xor, ^=)
 #undef ATOMIC_OP_RETURN
 #undef ATOMIC_OP
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 #ifdef CONFIG_64BIT
 
 #define ATOMIC64_INIT(i) { (i) }
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 498785ffc25f..0311c3c42960 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -11,8 +11,6 @@
 #include <asm/cmpxchg.h>
 #include <asm/barrier.h>
 
-#define ATOMIC_INIT(i)		{ (i) }
-
 /*
  * Since *_return_relaxed and {cmp}xchg_relaxed are implemented with
  * a "bne-" instruction at the end, so an isync is enough as a acquire barrier
diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
index 96f95c9ebd97..400a8c8b6de7 100644
--- a/arch/riscv/include/asm/atomic.h
+++ b/arch/riscv/include/asm/atomic.h
@@ -19,8 +19,6 @@
 #include <asm/cmpxchg.h>
 #include <asm/barrier.h>
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 #define __atomic_acquire_fence()					\
 	__asm__ __volatile__(RISCV_ACQUIRE_BARRIER "" ::: "memory")
 
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 491ad53a0d4e..cae473a7b6f7 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -15,8 +15,6 @@
 #include <asm/barrier.h>
 #include <asm/cmpxchg.h>
 
-#define ATOMIC_INIT(i)  { (i) }
-
 static inline int atomic_read(const atomic_t *v)
 {
 	int c;
diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index f37b95a80232..7c2a8a703b9a 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -19,8 +19,6 @@
 #include <asm/cmpxchg.h>
 #include <asm/barrier.h>
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 #define atomic_read(v)		READ_ONCE((v)->counter)
 #define atomic_set(v,i)		WRITE_ONCE((v)->counter, (i))
 
diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h
index 94c930f0bc62..efad5532f169 100644
--- a/arch/sparc/include/asm/atomic_32.h
+++ b/arch/sparc/include/asm/atomic_32.h
@@ -18,8 +18,6 @@
 #include <asm/barrier.h>
 #include <asm-generic/atomic64.h>
 
-#define ATOMIC_INIT(i)  { (i) }
-
 int atomic_add_return(int, atomic_t *);
 int atomic_fetch_add(int, atomic_t *);
 int atomic_fetch_and(int, atomic_t *);
diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h
index b60448397d4f..6b235d3d1d9d 100644
--- a/arch/sparc/include/asm/atomic_64.h
+++ b/arch/sparc/include/asm/atomic_64.h
@@ -12,7 +12,6 @@
 #include <asm/cmpxchg.h>
 #include <asm/barrier.h>
 
-#define ATOMIC_INIT(i)		{ (i) }
 #define ATOMIC64_INIT(i)	{ (i) }
 
 #define atomic_read(v)		READ_ONCE((v)->counter)
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index bf35e476a776..b6cac6e9bb70 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -14,8 +14,6 @@
  * resource counting etc..
  */
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 /**
  * arch_atomic_read - read atomic variable
  * @v: pointer of type atomic_t
diff --git a/arch/xtensa/include/asm/atomic.h b/arch/xtensa/include/asm/atomic.h
index 3e7c6134ed32..744c2f463845 100644
--- a/arch/xtensa/include/asm/atomic.h
+++ b/arch/xtensa/include/asm/atomic.h
@@ -19,8 +19,6 @@
 #include <asm/cmpxchg.h>
 #include <asm/barrier.h>
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 /*
  * This Xtensa implementation assumes that the right mechanism
  * for exclusion is for locking interrupts to level EXCM_LEVEL.
diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 286867f593d2..11f96f40f4a7 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -159,8 +159,6 @@ ATOMIC_OP(xor, ^)
  * resource counting etc..
  */
 
-#define ATOMIC_INIT(i)	{ (i) }
-
 /**
  * atomic_read - read atomic variable
  * @v: pointer of type atomic_t
diff --git a/include/linux/types.h b/include/linux/types.h
index d3021c879179..a147977602b5 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -167,6 +167,8 @@ typedef struct {
 	int counter;
 } atomic_t;
 
+#define ATOMIC_INIT(i) { (i) }
+
 #ifdef CONFIG_64BIT
 typedef struct {
 	s64 counter;

From 459e39538e612b8dd130d34b93c9bfc89ecc836c Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 29 Jul 2020 22:33:16 +1000
Subject: [PATCH 478/502] locking/qspinlock: Do not include atomic.h from
 qspinlock_types.h

This patch breaks a header loop involving qspinlock_types.h.
The issue is that qspinlock_types.h includes atomic.h, which then
eventually includes kernel.h which could lead back to the original
file via spinlock_types.h.

As ATOMIC_INIT is now defined by linux/types.h, there is no longer
any need to include atomic.h from qspinlock_types.h.  This also
allows the CONFIG_PARAVIRT hack to be removed since it was trying
to prevent exactly this loop.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Waiman Long <longman@redhat.com>
Link: https://lkml.kernel.org/r/20200729123316.GC7047@gondor.apana.org.au
---
 include/asm-generic/qspinlock.h       | 1 +
 include/asm-generic/qspinlock_types.h | 8 --------
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h
index fde943d180e0..2b26cd729b94 100644
--- a/include/asm-generic/qspinlock.h
+++ b/include/asm-generic/qspinlock.h
@@ -11,6 +11,7 @@
 #define __ASM_GENERIC_QSPINLOCK_H
 
 #include <asm-generic/qspinlock_types.h>
+#include <linux/atomic.h>
 
 /**
  * queued_spin_is_locked - is the spinlock locked?
diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h
index 56d1309d32f8..2fd1fb89ec36 100644
--- a/include/asm-generic/qspinlock_types.h
+++ b/include/asm-generic/qspinlock_types.h
@@ -9,15 +9,7 @@
 #ifndef __ASM_GENERIC_QSPINLOCK_TYPES_H
 #define __ASM_GENERIC_QSPINLOCK_TYPES_H
 
-/*
- * Including atomic.h with PARAVIRT on will cause compilation errors because
- * of recursive header file incluson via paravirt_types.h. So don't include
- * it if PARAVIRT is on.
- */
-#ifndef CONFIG_PARAVIRT
 #include <linux/types.h>
-#include <linux/atomic.h>
-#endif
 
 typedef struct qspinlock {
 	union {

From 0d24f65e933ca89d55d17f6dbdb2a72ca88f0992 Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:07 +0200
Subject: [PATCH 479/502] Documentation: locking: Describe seqlock design and
 usage

Proper documentation for the design and usage of sequence counters and
sequential locks does not exist. Complete the seqlock.h documentation as
follows:

  - Divide all documentation on a seqcount_t vs. seqlock_t basis. The
    description for both mechanisms was intermingled, which is incorrect
    since the usage constraints for each type are vastly different.

  - Add an introductory paragraph describing the internal design of, and
    rationale for, sequence counters.

  - Document seqcount_t writer non-preemptibility requirement, which was
    not previously documented anywhere, and provide a clear rationale.

  - Provide template code for seqcount_t and seqlock_t initialization
    and reader/writer critical sections.

  - Recommend using seqlock_t by default. It implicitly handles the
    serialization and non-preemptibility requirements of writers.

At seqlock.h:

  - Remove references to brlocks as they've long been removed from the
    kernel.

  - Remove references to gcc-3.x since the kernel's minimum supported
    gcc version is 4.9.

References: 0f6ed63b1707 ("no need to keep brlock macros anymore...")
References: 6ec4476ac825 ("Raise gcc version requirement to 4.9")
Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-2-a.darwish@linutronix.de
---
 Documentation/locking/index.rst   |   1 +
 Documentation/locking/seqlock.rst | 170 ++++++++++++++++++++++++++++++
 include/linux/seqlock.h           |  81 +++++++-------
 3 files changed, 209 insertions(+), 43 deletions(-)
 create mode 100644 Documentation/locking/seqlock.rst

diff --git a/Documentation/locking/index.rst b/Documentation/locking/index.rst
index d785878cad65..7003bd5aeff4 100644
--- a/Documentation/locking/index.rst
+++ b/Documentation/locking/index.rst
@@ -14,6 +14,7 @@ locking
     mutex-design
     rt-mutex-design
     rt-mutex
+    seqlock
     spinlocks
     ww-mutex-design
     preempt-locking
diff --git a/Documentation/locking/seqlock.rst b/Documentation/locking/seqlock.rst
new file mode 100644
index 000000000000..366dd368d90a
--- /dev/null
+++ b/Documentation/locking/seqlock.rst
@@ -0,0 +1,170 @@
+======================================
+Sequence counters and sequential locks
+======================================
+
+Introduction
+============
+
+Sequence counters are a reader-writer consistency mechanism with
+lockless readers (read-only retry loops), and no writer starvation. They
+are used for data that's rarely written to (e.g. system time), where the
+reader wants a consistent set of information and is willing to retry if
+that information changes.
+
+A data set is consistent when the sequence count at the beginning of the
+read side critical section is even and the same sequence count value is
+read again at the end of the critical section. The data in the set must
+be copied out inside the read side critical section. If the sequence
+count has changed between the start and the end of the critical section,
+the reader must retry.
+
+Writers increment the sequence count at the start and the end of their
+critical section. After starting the critical section the sequence count
+is odd and indicates to the readers that an update is in progress. At
+the end of the write side critical section the sequence count becomes
+even again which lets readers make progress.
+
+A sequence counter write side critical section must never be preempted
+or interrupted by read side sections. Otherwise the reader will spin for
+the entire scheduler tick due to the odd sequence count value and the
+interrupted writer. If that reader belongs to a real-time scheduling
+class, it can spin forever and the kernel will livelock.
+
+This mechanism cannot be used if the protected data contains pointers,
+as the writer can invalidate a pointer that the reader is following.
+
+
+.. _seqcount_t:
+
+Sequence counters (``seqcount_t``)
+==================================
+
+This is the raw counting mechanism, which does not protect against
+multiple writers.  Write side critical sections must thus be serialized
+by an external lock.
+
+If the write serialization primitive is not implicitly disabling
+preemption, preemption must be explicitly disabled before entering the
+write side section. If the read section can be invoked from hardirq or
+softirq contexts, interrupts or bottom halves must also be respectively
+disabled before entering the write section.
+
+If it's desired to automatically handle the sequence counter
+requirements of writer serialization and non-preemptibility, use
+:ref:`seqlock_t` instead.
+
+Initialization::
+
+	/* dynamic */
+	seqcount_t foo_seqcount;
+	seqcount_init(&foo_seqcount);
+
+	/* static */
+	static seqcount_t foo_seqcount = SEQCNT_ZERO(foo_seqcount);
+
+	/* C99 struct init */
+	struct {
+		.seq   = SEQCNT_ZERO(foo.seq),
+	} foo;
+
+Write path::
+
+	/* Serialized context with disabled preemption */
+
+	write_seqcount_begin(&foo_seqcount);
+
+	/* ... [[write-side critical section]] ... */
+
+	write_seqcount_end(&foo_seqcount);
+
+Read path::
+
+	do {
+		seq = read_seqcount_begin(&foo_seqcount);
+
+		/* ... [[read-side critical section]] ... */
+
+	} while (read_seqcount_retry(&foo_seqcount, seq));
+
+
+.. _seqlock_t:
+
+Sequential locks (``seqlock_t``)
+================================
+
+This contains the :ref:`seqcount_t` mechanism earlier discussed, plus an
+embedded spinlock for writer serialization and non-preemptibility.
+
+If the read side section can be invoked from hardirq or softirq context,
+use the write side function variants which disable interrupts or bottom
+halves respectively.
+
+Initialization::
+
+	/* dynamic */
+	seqlock_t foo_seqlock;
+	seqlock_init(&foo_seqlock);
+
+	/* static */
+	static DEFINE_SEQLOCK(foo_seqlock);
+
+	/* C99 struct init */
+	struct {
+		.seql   = __SEQLOCK_UNLOCKED(foo.seql)
+	} foo;
+
+Write path::
+
+	write_seqlock(&foo_seqlock);
+
+	/* ... [[write-side critical section]] ... */
+
+	write_sequnlock(&foo_seqlock);
+
+Read path, three categories:
+
+1. Normal Sequence readers which never block a writer but they must
+   retry if a writer is in progress by detecting change in the sequence
+   number.  Writers do not wait for a sequence reader::
+
+	do {
+		seq = read_seqbegin(&foo_seqlock);
+
+		/* ... [[read-side critical section]] ... */
+
+	} while (read_seqretry(&foo_seqlock, seq));
+
+2. Locking readers which will wait if a writer or another locking reader
+   is in progress. A locking reader in progress will also block a writer
+   from entering its critical section. This read lock is
+   exclusive. Unlike rwlock_t, only one locking reader can acquire it::
+
+	read_seqlock_excl(&foo_seqlock);
+
+	/* ... [[read-side critical section]] ... */
+
+	read_sequnlock_excl(&foo_seqlock);
+
+3. Conditional lockless reader (as in 1), or locking reader (as in 2),
+   according to a passed marker. This is used to avoid lockless readers
+   starvation (too many retry loops) in case of a sharp spike in write
+   activity. First, a lockless read is tried (even marker passed). If
+   that trial fails (odd sequence counter is returned, which is used as
+   the next iteration marker), the lockless read is transformed to a
+   full locking read and no retry loop is necessary::
+
+	/* marker; even initialization */
+	int seq = 0;
+	do {
+		read_seqbegin_or_lock(&foo_seqlock, &seq);
+
+		/* ... [[read-side critical section]] ... */
+
+	} while (need_seqretry(&foo_seqlock, seq));
+	done_seqretry(&foo_seqlock, seq);
+
+
+API documentation
+=================
+
+.. kernel-doc:: include/linux/seqlock.h
diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 8b97204f35a7..299d68f10325 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -1,36 +1,15 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #ifndef __LINUX_SEQLOCK_H
 #define __LINUX_SEQLOCK_H
+
 /*
- * Reader/writer consistent mechanism without starving writers. This type of
- * lock for data where the reader wants a consistent set of information
- * and is willing to retry if the information changes. There are two types
- * of readers:
- * 1. Sequence readers which never block a writer but they may have to retry
- *    if a writer is in progress by detecting change in sequence number.
- *    Writers do not wait for a sequence reader.
- * 2. Locking readers which will wait if a writer or another locking reader
- *    is in progress. A locking reader in progress will also block a writer
- *    from going forward. Unlike the regular rwlock, the read lock here is
- *    exclusive so that only one locking reader can get it.
+ * seqcount_t / seqlock_t - a reader-writer consistency mechanism with
+ * lockless readers (read-only retry loops), and no writer starvation.
  *
- * This is not as cache friendly as brlock. Also, this may not work well
- * for data that contains pointers, because any writer could
- * invalidate a pointer that a reader was following.
+ * See Documentation/locking/seqlock.rst
  *
- * Expected non-blocking reader usage:
- * 	do {
- *	    seq = read_seqbegin(&foo);
- * 	...
- *      } while (read_seqretry(&foo, seq));
- *
- *
- * On non-SMP the spin locks disappear but the writer still needs
- * to increment the sequence variables because an interrupt routine could
- * change the state of the data.
- *
- * Based on x86_64 vsyscall gettimeofday 
- * by Keith Owens and Andrea Arcangeli
+ * Copyrights:
+ * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli
  */
 
 #include <linux/spinlock.h>
@@ -41,8 +20,8 @@
 #include <asm/processor.h>
 
 /*
- * The seqlock interface does not prescribe a precise sequence of read
- * begin/retry/end. For readers, typically there is a call to
+ * The seqlock seqcount_t interface does not prescribe a precise sequence of
+ * read begin/retry/end. For readers, typically there is a call to
  * read_seqcount_begin() and read_seqcount_retry(), however, there are more
  * esoteric cases which do not follow this pattern.
  *
@@ -50,16 +29,30 @@
  * via seqcount_t under KCSAN: upon beginning a seq-reader critical section,
  * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as
  * atomics; if there is a matching read_seqcount_retry() call, no following
- * memory operations are considered atomic. Usage of seqlocks via seqlock_t
- * interface is not affected.
+ * memory operations are considered atomic. Usage of the seqlock_t interface
+ * is not affected.
  */
 #define KCSAN_SEQLOCK_REGION_MAX 1000
 
 /*
- * Version using sequence counter only.
- * This can be used when code has its own mutex protecting the
- * updating starting before the write_seqcountbeqin() and ending
- * after the write_seqcount_end().
+ * Sequence counters (seqcount_t)
+ *
+ * This is the raw counting mechanism, without any writer protection.
+ *
+ * Write side critical sections must be serialized and non-preemptible.
+ *
+ * If readers can be invoked from hardirq or softirq contexts,
+ * interrupts or bottom halves must also be respectively disabled before
+ * entering the write section.
+ *
+ * This mechanism can't be used if the protected data contains pointers,
+ * as the writer can invalidate a pointer that a reader is following.
+ *
+ * If it's desired to automatically handle the sequence counter writer
+ * serialization and non-preemptibility requirements, use a sequential
+ * lock (seqlock_t) instead.
+ *
+ * See Documentation/locking/seqlock.rst
  */
 typedef struct seqcount {
 	unsigned sequence;
@@ -398,10 +391,6 @@ static inline void raw_write_seqcount_latch(seqcount_t *s)
        smp_wmb();      /* increment "sequence" before following stores */
 }
 
-/*
- * Sequence counter only version assumes that callers are using their
- * own mutexing.
- */
 static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass)
 {
 	raw_write_seqcount_begin(s);
@@ -434,15 +423,21 @@ static inline void write_seqcount_invalidate(seqcount_t *s)
 	kcsan_nestable_atomic_end();
 }
 
+/*
+ * Sequential locks (seqlock_t)
+ *
+ * Sequence counters with an embedded spinlock for writer serialization
+ * and non-preemptibility.
+ *
+ * For more info, see:
+ *    - Comments on top of seqcount_t
+ *    - Documentation/locking/seqlock.rst
+ */
 typedef struct {
 	struct seqcount seqcount;
 	spinlock_t lock;
 } seqlock_t;
 
-/*
- * These macros triggered gcc-3.x compile-time problems.  We think these are
- * OK now.  Be cautious.
- */
 #define __SEQLOCK_UNLOCKED(lockname)			\
 	{						\
 		.seqcount = SEQCNT_ZERO(lockname),	\

From 15cbe67bbd3adeb4854c42713dbeaf2ff876beee Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:08 +0200
Subject: [PATCH 480/502] seqlock: Properly format kernel-doc code samples

Align the code samples and note sections inside kernel-doc comments with
tabs. This way they can be properly parsed and rendered by Sphinx. It
also makes the code samples easier to read from text editors.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-3-a.darwish@linutronix.de
---
 include/linux/seqlock.h | 108 +++++++++++++++++++++-------------------
 1 file changed, 56 insertions(+), 52 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 299d68f10325..6c4f68ef1393 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -263,32 +263,32 @@ static inline void raw_write_seqcount_end(seqcount_t *s)
  * atomically, avoiding compiler optimizations; b) to document which writes are
  * meant to propagate to the reader critical section. This is necessary because
  * neither writes before and after the barrier are enclosed in a seq-writer
- * critical section that would ensure readers are aware of ongoing writes.
+ * critical section that would ensure readers are aware of ongoing writes::
  *
- *      seqcount_t seq;
- *      bool X = true, Y = false;
+ *	seqcount_t seq;
+ *	bool X = true, Y = false;
  *
- *      void read(void)
- *      {
- *              bool x, y;
+ *	void read(void)
+ *	{
+ *		bool x, y;
  *
- *              do {
- *                      int s = read_seqcount_begin(&seq);
+ *		do {
+ *			int s = read_seqcount_begin(&seq);
  *
- *                      x = X; y = Y;
+ *			x = X; y = Y;
  *
- *              } while (read_seqcount_retry(&seq, s));
+ *		} while (read_seqcount_retry(&seq, s));
  *
- *              BUG_ON(!x && !y);
+ *		BUG_ON(!x && !y);
  *      }
  *
  *      void write(void)
  *      {
- *              WRITE_ONCE(Y, true);
+ *		WRITE_ONCE(Y, true);
  *
- *              raw_write_seqcount_barrier(seq);
+ *		raw_write_seqcount_barrier(seq);
  *
- *              WRITE_ONCE(X, false);
+ *		WRITE_ONCE(X, false);
  *      }
  */
 static inline void raw_write_seqcount_barrier(seqcount_t *s)
@@ -325,64 +325,68 @@ static inline int raw_read_seqcount_latch(seqcount_t *s)
  * Very simply put: we first modify one copy and then the other. This ensures
  * there is always one copy in a stable state, ready to give us an answer.
  *
- * The basic form is a data structure like:
+ * The basic form is a data structure like::
  *
- * struct latch_struct {
- *	seqcount_t		seq;
- *	struct data_struct	data[2];
- * };
+ *	struct latch_struct {
+ *		seqcount_t		seq;
+ *		struct data_struct	data[2];
+ *	};
  *
  * Where a modification, which is assumed to be externally serialized, does the
- * following:
+ * following::
  *
- * void latch_modify(struct latch_struct *latch, ...)
- * {
- *	smp_wmb();	<- Ensure that the last data[1] update is visible
- *	latch->seq++;
- *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *	void latch_modify(struct latch_struct *latch, ...)
+ *	{
+ *		smp_wmb();	// Ensure that the last data[1] update is visible
+ *		latch->seq++;
+ *		smp_wmb();	// Ensure that the seqcount update is visible
  *
- *	modify(latch->data[0], ...);
+ *		modify(latch->data[0], ...);
  *
- *	smp_wmb();	<- Ensure that the data[0] update is visible
- *	latch->seq++;
- *	smp_wmb();	<- Ensure that the seqcount update is visible
+ *		smp_wmb();	// Ensure that the data[0] update is visible
+ *		latch->seq++;
+ *		smp_wmb();	// Ensure that the seqcount update is visible
  *
- *	modify(latch->data[1], ...);
- * }
+ *		modify(latch->data[1], ...);
+ *	}
  *
- * The query will have a form like:
+ * The query will have a form like::
  *
- * struct entry *latch_query(struct latch_struct *latch, ...)
- * {
- *	struct entry *entry;
- *	unsigned seq, idx;
+ *	struct entry *latch_query(struct latch_struct *latch, ...)
+ *	{
+ *		struct entry *entry;
+ *		unsigned seq, idx;
  *
- *	do {
- *		seq = raw_read_seqcount_latch(&latch->seq);
+ *		do {
+ *			seq = raw_read_seqcount_latch(&latch->seq);
  *
- *		idx = seq & 0x01;
- *		entry = data_query(latch->data[idx], ...);
+ *			idx = seq & 0x01;
+ *			entry = data_query(latch->data[idx], ...);
  *
- *		smp_rmb();
- *	} while (seq != latch->seq);
+ *			smp_rmb();
+ *		} while (seq != latch->seq);
  *
- *	return entry;
- * }
+ *		return entry;
+ *	}
  *
  * So during the modification, queries are first redirected to data[1]. Then we
  * modify data[0]. When that is complete, we redirect queries back to data[0]
  * and we can modify data[1].
  *
- * NOTE: The non-requirement for atomic modifications does _NOT_ include
- *       the publishing of new entries in the case where data is a dynamic
- *       data structure.
+ * NOTE:
  *
- *       An iteration might start in data[0] and get suspended long enough
- *       to miss an entire modification sequence, once it resumes it might
- *       observe the new entry.
+ *	The non-requirement for atomic modifications does _NOT_ include
+ *	the publishing of new entries in the case where data is a dynamic
+ *	data structure.
  *
- * NOTE: When data is a dynamic data structure; one should use regular RCU
- *       patterns to manage the lifetimes of the objects within.
+ *	An iteration might start in data[0] and get suspended long enough
+ *	to miss an entire modification sequence, once it resumes it might
+ *	observe the new entry.
+ *
+ * NOTE:
+ *
+ *	When data is a dynamic data structure; one should use regular RCU
+ *	patterns to manage the lifetimes of the objects within.
  */
 static inline void raw_write_seqcount_latch(seqcount_t *s)
 {

From d3b35b87f436c1b226a8061bee9c8875ba6658bd Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:09 +0200
Subject: [PATCH 481/502] seqlock: seqcount_t latch: End read sections with
 read_seqcount_retry()

The seqcount_t latch reader example at the raw_write_seqcount_latch()
kernel-doc comment ends the latch read section with a manual smp memory
barrier and sequence counter comparison.

This is technically correct, but it is suboptimal: read_seqcount_retry()
already contains the same logic of an smp memory barrier and sequence
counter comparison.

End the latch read critical section example with read_seqcount_retry().

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-4-a.darwish@linutronix.de
---
 include/linux/seqlock.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 6c4f68ef1393..d724b5e5408d 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -363,8 +363,8 @@ static inline int raw_read_seqcount_latch(seqcount_t *s)
  *			idx = seq & 0x01;
  *			entry = data_query(latch->data[idx], ...);
  *
- *			smp_rmb();
- *		} while (seq != latch->seq);
+ *		// read_seqcount_retry() includes needed smp_rmb()
+ *		} while (read_seqcount_retry(&latch->seq, seq));
  *
  *		return entry;
  *	}

From f4a27cbcec90ac04ee60e04b222e1449dcdba0bd Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:10 +0200
Subject: [PATCH 482/502] seqlock: Reorder seqcount_t and seqlock_t API
 definitions

The seqlock.h seqcount_t and seqlock_t API definitions are presented in
the chronological order of their development rather than the order that
makes most sense to readers. This makes it hard to follow and understand
the header file code.

Group and reorder all of the exported seqlock.h functions according to
their function.

First, group together the seqcount_t standard read path functions:

    - __read_seqcount_begin()
    - raw_read_seqcount_begin()
    - read_seqcount_begin()

since each function is implemented exactly in terms of the one above
it. Then, group the special-case seqcount_t readers on their own as:

    - raw_read_seqcount()
    - raw_seqcount_begin()

since the only difference between the two functions is that the second
one masks the sequence counter LSB while the first one does not. Note
that raw_seqcount_begin() can actually be implemented in terms of
raw_read_seqcount(), which will be done in a follow-up commit.

Then, group the seqcount_t write path functions, instead of injecting
unrelated seqcount_t latch functions between them, and order them as:

    - raw_write_seqcount_begin()
    - raw_write_seqcount_end()
    - write_seqcount_begin_nested()
    - write_seqcount_begin()
    - write_seqcount_end()
    - raw_write_seqcount_barrier()
    - write_seqcount_invalidate()

which is the expected natural order. This also isolates the seqcount_t
latch functions into their own area, at the end of the sequence counters
section, and before jumping to the next one: sequential locks
(seqlock_t).

Do a similar grouping and reordering for seqlock_t "locking" readers vs.
the "conditionally locking or lockless" ones.

No implementation code was changed in any of the reordering above.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-5-a.darwish@linutronix.de
---
 include/linux/seqlock.h | 158 ++++++++++++++++++++--------------------
 1 file changed, 78 insertions(+), 80 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index d724b5e5408d..4c1456008d89 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -128,23 +128,6 @@ repeat:
 	return ret;
 }
 
-/**
- * raw_read_seqcount - Read the raw seqcount
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
- *
- * raw_read_seqcount opens a read critical section of the given
- * seqcount without any lockdep checking and without checking or
- * masking the LSB. Calling code is responsible for handling that.
- */
-static inline unsigned raw_read_seqcount(const seqcount_t *s)
-{
-	unsigned ret = READ_ONCE(s->sequence);
-	smp_rmb();
-	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
-	return ret;
-}
-
 /**
  * raw_read_seqcount_begin - start seq-read critical section w/o lockdep
  * @s: pointer to seqcount_t
@@ -176,6 +159,23 @@ static inline unsigned read_seqcount_begin(const seqcount_t *s)
 	return raw_read_seqcount_begin(s);
 }
 
+/**
+ * raw_read_seqcount - Read the raw seqcount
+ * @s: pointer to seqcount_t
+ * Returns: count to be passed to read_seqcount_retry
+ *
+ * raw_read_seqcount opens a read critical section of the given
+ * seqcount without any lockdep checking and without checking or
+ * masking the LSB. Calling code is responsible for handling that.
+ */
+static inline unsigned raw_read_seqcount(const seqcount_t *s)
+{
+	unsigned ret = READ_ONCE(s->sequence);
+	smp_rmb();
+	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
+	return ret;
+}
+
 /**
  * raw_seqcount_begin - begin a seq-read critical section
  * @s: pointer to seqcount_t
@@ -234,8 +234,6 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
 	return __read_seqcount_retry(s, start);
 }
 
-
-
 static inline void raw_write_seqcount_begin(seqcount_t *s)
 {
 	kcsan_nestable_atomic_begin();
@@ -250,6 +248,23 @@ static inline void raw_write_seqcount_end(seqcount_t *s)
 	kcsan_nestable_atomic_end();
 }
 
+static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass)
+{
+	raw_write_seqcount_begin(s);
+	seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
+}
+
+static inline void write_seqcount_begin(seqcount_t *s)
+{
+	write_seqcount_begin_nested(s, 0);
+}
+
+static inline void write_seqcount_end(seqcount_t *s)
+{
+	seqcount_release(&s->dep_map, _RET_IP_);
+	raw_write_seqcount_end(s);
+}
+
 /**
  * raw_write_seqcount_barrier - do a seq write barrier
  * @s: pointer to seqcount_t
@@ -300,6 +315,21 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s)
 	kcsan_nestable_atomic_end();
 }
 
+/**
+ * write_seqcount_invalidate - invalidate in-progress read-side seq operations
+ * @s: pointer to seqcount_t
+ *
+ * After write_seqcount_invalidate, no read-side seq operations will complete
+ * successfully and see data older than this.
+ */
+static inline void write_seqcount_invalidate(seqcount_t *s)
+{
+	smp_wmb();
+	kcsan_nestable_atomic_begin();
+	s->sequence+=2;
+	kcsan_nestable_atomic_end();
+}
+
 static inline int raw_read_seqcount_latch(seqcount_t *s)
 {
 	/* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */
@@ -395,38 +425,6 @@ static inline void raw_write_seqcount_latch(seqcount_t *s)
        smp_wmb();      /* increment "sequence" before following stores */
 }
 
-static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass)
-{
-	raw_write_seqcount_begin(s);
-	seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
-}
-
-static inline void write_seqcount_begin(seqcount_t *s)
-{
-	write_seqcount_begin_nested(s, 0);
-}
-
-static inline void write_seqcount_end(seqcount_t *s)
-{
-	seqcount_release(&s->dep_map, _RET_IP_);
-	raw_write_seqcount_end(s);
-}
-
-/**
- * write_seqcount_invalidate - invalidate in-progress read-side seq operations
- * @s: pointer to seqcount_t
- *
- * After write_seqcount_invalidate, no read-side seq operations will complete
- * successfully and see data older than this.
- */
-static inline void write_seqcount_invalidate(seqcount_t *s)
-{
-	smp_wmb();
-	kcsan_nestable_atomic_begin();
-	s->sequence+=2;
-	kcsan_nestable_atomic_end();
-}
-
 /*
  * Sequential locks (seqlock_t)
  *
@@ -555,35 +553,6 @@ static inline void read_sequnlock_excl(seqlock_t *sl)
 	spin_unlock(&sl->lock);
 }
 
-/**
- * read_seqbegin_or_lock - begin a sequence number check or locking block
- * @lock: sequence lock
- * @seq : sequence number to be checked
- *
- * First try it once optimistically without taking the lock. If that fails,
- * take the lock. The sequence number is also used as a marker for deciding
- * whether to be a reader (even) or writer (odd).
- * N.B. seq must be initialized to an even number to begin with.
- */
-static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
-{
-	if (!(*seq & 1))	/* Even */
-		*seq = read_seqbegin(lock);
-	else			/* Odd */
-		read_seqlock_excl(lock);
-}
-
-static inline int need_seqretry(seqlock_t *lock, int seq)
-{
-	return !(seq & 1) && read_seqretry(lock, seq);
-}
-
-static inline void done_seqretry(seqlock_t *lock, int seq)
-{
-	if (seq & 1)
-		read_sequnlock_excl(lock);
-}
-
 static inline void read_seqlock_excl_bh(seqlock_t *sl)
 {
 	spin_lock_bh(&sl->lock);
@@ -621,6 +590,35 @@ read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
 	spin_unlock_irqrestore(&sl->lock, flags);
 }
 
+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * @lock: sequence lock
+ * @seq : sequence number to be checked
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a reader (even) or writer (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+	if (!(*seq & 1))	/* Even */
+		*seq = read_seqbegin(lock);
+	else			/* Odd */
+		read_seqlock_excl(lock);
+}
+
+static inline int need_seqretry(seqlock_t *lock, int seq)
+{
+	return !(seq & 1) && read_seqretry(lock, seq);
+}
+
+static inline void done_seqretry(seqlock_t *lock, int seq)
+{
+	if (seq & 1)
+		read_sequnlock_excl(lock);
+}
+
 static inline unsigned long
 read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
 {

From 89b88845e05752b3d684eaf147f457c8dfa99c5f Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:11 +0200
Subject: [PATCH 483/502] seqlock: Add kernel-doc for seqcount_t and seqlock_t
 APIs

seqlock.h is now included by the kernel's RST documentation, but only a
small number of the exported seqlock.h functions are kernel-doc annotated.

Add kernel-doc for all seqlock.h exported APIs.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-6-a.darwish@linutronix.de
---
 include/linux/seqlock.h | 423 ++++++++++++++++++++++++++++++++--------
 1 file changed, 347 insertions(+), 76 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 4c1456008d89..85fb3ac93ffb 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -75,6 +75,10 @@ static inline void __seqcount_init(seqcount_t *s, const char *name,
 # define SEQCOUNT_DEP_MAP_INIT(lockname) \
 		.dep_map = { .name = #lockname } \
 
+/**
+ * seqcount_init() - runtime initializer for seqcount_t
+ * @s: Pointer to the seqcount_t instance
+ */
 # define seqcount_init(s)				\
 	do {						\
 		static struct lock_class_key __key;	\
@@ -98,13 +102,15 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
 # define seqcount_lockdep_reader_access(x)
 #endif
 
-#define SEQCNT_ZERO(lockname) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(lockname)}
-
+/**
+ * SEQCNT_ZERO() - static initializer for seqcount_t
+ * @name: Name of the seqcount_t instance
+ */
+#define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) }
 
 /**
- * __read_seqcount_begin - begin a seq-read critical section (without barrier)
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
+ * __read_seqcount_begin() - begin a seqcount_t read section w/o barrier
+ * @s: Pointer to seqcount_t
  *
  * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb()
  * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
@@ -113,6 +119,8 @@ static inline void seqcount_lockdep_reader_access(const seqcount_t *s)
  *
  * Use carefully, only in critical code, and comment how the barrier is
  * provided.
+ *
+ * Return: count to be passed to read_seqcount_retry()
  */
 static inline unsigned __read_seqcount_begin(const seqcount_t *s)
 {
@@ -129,13 +137,10 @@ repeat:
 }
 
 /**
- * raw_read_seqcount_begin - start seq-read critical section w/o lockdep
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
+ * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep
+ * @s: Pointer to seqcount_t
  *
- * raw_read_seqcount_begin opens a read critical section of the given
- * seqcount, but without any lockdep checking. Validity of the critical
- * section is tested by checking read_seqcount_retry function.
+ * Return: count to be passed to read_seqcount_retry()
  */
 static inline unsigned raw_read_seqcount_begin(const seqcount_t *s)
 {
@@ -145,13 +150,10 @@ static inline unsigned raw_read_seqcount_begin(const seqcount_t *s)
 }
 
 /**
- * read_seqcount_begin - begin a seq-read critical section
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
+ * read_seqcount_begin() - begin a seqcount_t read critical section
+ * @s: Pointer to seqcount_t
  *
- * read_seqcount_begin opens a read critical section of the given seqcount.
- * Validity of the critical section is tested by checking read_seqcount_retry
- * function.
+ * Return: count to be passed to read_seqcount_retry()
  */
 static inline unsigned read_seqcount_begin(const seqcount_t *s)
 {
@@ -160,13 +162,15 @@ static inline unsigned read_seqcount_begin(const seqcount_t *s)
 }
 
 /**
- * raw_read_seqcount - Read the raw seqcount
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
+ * raw_read_seqcount() - read the raw seqcount_t counter value
+ * @s: Pointer to seqcount_t
  *
  * raw_read_seqcount opens a read critical section of the given
- * seqcount without any lockdep checking and without checking or
- * masking the LSB. Calling code is responsible for handling that.
+ * seqcount_t, without any lockdep checking, and without checking or
+ * masking the sequence counter LSB. Calling code is responsible for
+ * handling that.
+ *
+ * Return: count to be passed to read_seqcount_retry()
  */
 static inline unsigned raw_read_seqcount(const seqcount_t *s)
 {
@@ -177,18 +181,21 @@ static inline unsigned raw_read_seqcount(const seqcount_t *s)
 }
 
 /**
- * raw_seqcount_begin - begin a seq-read critical section
- * @s: pointer to seqcount_t
- * Returns: count to be passed to read_seqcount_retry
+ * raw_seqcount_begin() - begin a seqcount_t read critical section w/o
+ *                        lockdep and w/o counter stabilization
+ * @s: Pointer to seqcount_t
  *
- * raw_seqcount_begin opens a read critical section of the given seqcount.
- * Validity of the critical section is tested by checking read_seqcount_retry
- * function.
+ * raw_seqcount_begin opens a read critical section of the given
+ * seqcount_t. Unlike read_seqcount_begin(), this function will not wait
+ * for the count to stabilize. If a writer is active when it begins, it
+ * will fail the read_seqcount_retry() at the end of the read critical
+ * section instead of stabilizing at the beginning of it.
  *
- * Unlike read_seqcount_begin(), this function will not wait for the count
- * to stabilize. If a writer is active when we begin, we will fail the
- * read_seqcount_retry() instead of stabilizing at the beginning of the
- * critical section.
+ * Use this only in special kernel hot paths where the read section is
+ * small and has a high probability of success through other external
+ * means. It will save a single branching instruction.
+ *
+ * Return: count to be passed to read_seqcount_retry()
  */
 static inline unsigned raw_seqcount_begin(const seqcount_t *s)
 {
@@ -199,10 +206,9 @@ static inline unsigned raw_seqcount_begin(const seqcount_t *s)
 }
 
 /**
- * __read_seqcount_retry - end a seq-read critical section (without barrier)
- * @s: pointer to seqcount_t
- * @start: count, from read_seqcount_begin
- * Returns: 1 if retry is required, else 0
+ * __read_seqcount_retry() - end a seqcount_t read section w/o barrier
+ * @s: Pointer to seqcount_t
+ * @start: count, from read_seqcount_begin()
  *
  * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb()
  * barrier. Callers should ensure that smp_rmb() or equivalent ordering is
@@ -211,6 +217,8 @@ static inline unsigned raw_seqcount_begin(const seqcount_t *s)
  *
  * Use carefully, only in critical code, and comment how the barrier is
  * provided.
+ *
+ * Return: true if a read section retry is required, else false
  */
 static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start)
 {
@@ -219,14 +227,15 @@ static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start)
 }
 
 /**
- * read_seqcount_retry - end a seq-read critical section
- * @s: pointer to seqcount_t
- * @start: count, from read_seqcount_begin
- * Returns: 1 if retry is required, else 0
+ * read_seqcount_retry() - end a seqcount_t read critical section
+ * @s: Pointer to seqcount_t
+ * @start: count, from read_seqcount_begin()
  *
- * read_seqcount_retry closes a read critical section of the given seqcount.
- * If the critical section was invalid, it must be ignored (and typically
- * retried).
+ * read_seqcount_retry closes the read critical section of given
+ * seqcount_t.  If the critical section was invalid, it must be ignored
+ * (and typically retried).
+ *
+ * Return: true if a read section retry is required, else false
  */
 static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
 {
@@ -234,6 +243,10 @@ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start)
 	return __read_seqcount_retry(s, start);
 }
 
+/**
+ * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep
+ * @s: Pointer to seqcount_t
+ */
 static inline void raw_write_seqcount_begin(seqcount_t *s)
 {
 	kcsan_nestable_atomic_begin();
@@ -241,6 +254,10 @@ static inline void raw_write_seqcount_begin(seqcount_t *s)
 	smp_wmb();
 }
 
+/**
+ * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep
+ * @s: Pointer to seqcount_t
+ */
 static inline void raw_write_seqcount_end(seqcount_t *s)
 {
 	smp_wmb();
@@ -248,17 +265,42 @@ static inline void raw_write_seqcount_end(seqcount_t *s)
 	kcsan_nestable_atomic_end();
 }
 
+/**
+ * write_seqcount_begin_nested() - start a seqcount_t write section with
+ *                                 custom lockdep nesting level
+ * @s: Pointer to seqcount_t
+ * @subclass: lockdep nesting level
+ *
+ * See Documentation/locking/lockdep-design.rst
+ */
 static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass)
 {
 	raw_write_seqcount_begin(s);
 	seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
 }
 
+/**
+ * write_seqcount_begin() - start a seqcount_t write side critical section
+ * @s: Pointer to seqcount_t
+ *
+ * write_seqcount_begin opens a write side critical section of the given
+ * seqcount_t.
+ *
+ * Context: seqcount_t write side critical sections must be serialized and
+ * non-preemptible. If readers can be invoked from hardirq or softirq
+ * context, interrupts or bottom halves must be respectively disabled.
+ */
 static inline void write_seqcount_begin(seqcount_t *s)
 {
 	write_seqcount_begin_nested(s, 0);
 }
 
+/**
+ * write_seqcount_end() - end a seqcount_t write side critical section
+ * @s: Pointer to seqcount_t
+ *
+ * The write section must've been opened with write_seqcount_begin().
+ */
 static inline void write_seqcount_end(seqcount_t *s)
 {
 	seqcount_release(&s->dep_map, _RET_IP_);
@@ -266,12 +308,12 @@ static inline void write_seqcount_end(seqcount_t *s)
 }
 
 /**
- * raw_write_seqcount_barrier - do a seq write barrier
- * @s: pointer to seqcount_t
+ * raw_write_seqcount_barrier() - do a seqcount_t write barrier
+ * @s: Pointer to seqcount_t
  *
- * This can be used to provide an ordering guarantee instead of the
- * usual consistency guarantee. It is one wmb cheaper, because we can
- * collapse the two back-to-back wmb()s.
+ * This can be used to provide an ordering guarantee instead of the usual
+ * consistency guarantee. It is one wmb cheaper, because it can collapse
+ * the two back-to-back wmb()s.
  *
  * Note that writes surrounding the barrier should be declared atomic (e.g.
  * via WRITE_ONCE): a) to ensure the writes become visible to other threads
@@ -316,11 +358,12 @@ static inline void raw_write_seqcount_barrier(seqcount_t *s)
 }
 
 /**
- * write_seqcount_invalidate - invalidate in-progress read-side seq operations
- * @s: pointer to seqcount_t
+ * write_seqcount_invalidate() - invalidate in-progress seqcount_t read
+ *                               side operations
+ * @s: Pointer to seqcount_t
  *
- * After write_seqcount_invalidate, no read-side seq operations will complete
- * successfully and see data older than this.
+ * After write_seqcount_invalidate, no seqcount_t read side operations
+ * will complete successfully and see data older than this.
  */
 static inline void write_seqcount_invalidate(seqcount_t *s)
 {
@@ -330,6 +373,21 @@ static inline void write_seqcount_invalidate(seqcount_t *s)
 	kcsan_nestable_atomic_end();
 }
 
+/**
+ * raw_read_seqcount_latch() - pick even/odd seqcount_t latch data copy
+ * @s: Pointer to seqcount_t
+ *
+ * Use seqcount_t latching to switch between two storage places protected
+ * by a sequence counter. Doing so allows having interruptible, preemptible,
+ * seqcount_t write side critical sections.
+ *
+ * Check raw_write_seqcount_latch() for more details and a full reader and
+ * writer usage example.
+ *
+ * Return: sequence counter raw value. Use the lowest bit as an index for
+ * picking which data copy to read. The full counter value must then be
+ * checked with read_seqcount_retry().
+ */
 static inline int raw_read_seqcount_latch(seqcount_t *s)
 {
 	/* Pairs with the first smp_wmb() in raw_write_seqcount_latch() */
@@ -338,8 +396,8 @@ static inline int raw_read_seqcount_latch(seqcount_t *s)
 }
 
 /**
- * raw_write_seqcount_latch - redirect readers to even/odd copy
- * @s: pointer to seqcount_t
+ * raw_write_seqcount_latch() - redirect readers to even/odd copy
+ * @s: Pointer to seqcount_t
  *
  * The latch technique is a multiversion concurrency control method that allows
  * queries during non-atomic modifications. If you can guarantee queries never
@@ -446,17 +504,28 @@ typedef struct {
 		.lock =	__SPIN_LOCK_UNLOCKED(lockname)	\
 	}
 
-#define seqlock_init(x)					\
+/**
+ * seqlock_init() - dynamic initializer for seqlock_t
+ * @sl: Pointer to the seqlock_t instance
+ */
+#define seqlock_init(sl)				\
 	do {						\
-		seqcount_init(&(x)->seqcount);		\
-		spin_lock_init(&(x)->lock);		\
+		seqcount_init(&(sl)->seqcount);		\
+		spin_lock_init(&(sl)->lock);		\
 	} while (0)
 
-#define DEFINE_SEQLOCK(x) \
-		seqlock_t x = __SEQLOCK_UNLOCKED(x)
+/**
+ * DEFINE_SEQLOCK() - Define a statically allocated seqlock_t
+ * @sl: Name of the seqlock_t instance
+ */
+#define DEFINE_SEQLOCK(sl) \
+		seqlock_t sl = __SEQLOCK_UNLOCKED(sl)
 
-/*
- * Read side functions for starting and finalizing a read side section.
+/**
+ * read_seqbegin() - start a seqlock_t read side critical section
+ * @sl: Pointer to seqlock_t
+ *
+ * Return: count, to be passed to read_seqretry()
  */
 static inline unsigned read_seqbegin(const seqlock_t *sl)
 {
@@ -467,6 +536,17 @@ static inline unsigned read_seqbegin(const seqlock_t *sl)
 	return ret;
 }
 
+/**
+ * read_seqretry() - end a seqlock_t read side section
+ * @sl: Pointer to seqlock_t
+ * @start: count, from read_seqbegin()
+ *
+ * read_seqretry closes the read side critical section of given seqlock_t.
+ * If the critical section was invalid, it must be ignored (and typically
+ * retried).
+ *
+ * Return: true if a read section retry is required, else false
+ */
 static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
 {
 	/*
@@ -478,10 +558,18 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
 	return read_seqcount_retry(&sl->seqcount, start);
 }
 
-/*
- * Lock out other writers and update the count.
- * Acts like a normal spin_lock/unlock.
- * Don't need preempt_disable() because that is in the spin_lock already.
+/**
+ * write_seqlock() - start a seqlock_t write side critical section
+ * @sl: Pointer to seqlock_t
+ *
+ * write_seqlock opens a write side critical section for the given
+ * seqlock_t.  It also implicitly acquires the spinlock_t embedded inside
+ * that sequential lock. All seqlock_t write side sections are thus
+ * automatically serialized and non-preemptible.
+ *
+ * Context: if the seqlock_t read section, or other write side critical
+ * sections, can be invoked from hardirq or softirq contexts, use the
+ * _irqsave or _bh variants of this function instead.
  */
 static inline void write_seqlock(seqlock_t *sl)
 {
@@ -489,30 +577,66 @@ static inline void write_seqlock(seqlock_t *sl)
 	write_seqcount_begin(&sl->seqcount);
 }
 
+/**
+ * write_sequnlock() - end a seqlock_t write side critical section
+ * @sl: Pointer to seqlock_t
+ *
+ * write_sequnlock closes the (serialized and non-preemptible) write side
+ * critical section of given seqlock_t.
+ */
 static inline void write_sequnlock(seqlock_t *sl)
 {
 	write_seqcount_end(&sl->seqcount);
 	spin_unlock(&sl->lock);
 }
 
+/**
+ * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section
+ * @sl: Pointer to seqlock_t
+ *
+ * _bh variant of write_seqlock(). Use only if the read side section, or
+ * other write side sections, can be invoked from softirq contexts.
+ */
 static inline void write_seqlock_bh(seqlock_t *sl)
 {
 	spin_lock_bh(&sl->lock);
 	write_seqcount_begin(&sl->seqcount);
 }
 
+/**
+ * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section
+ * @sl: Pointer to seqlock_t
+ *
+ * write_sequnlock_bh closes the serialized, non-preemptible, and
+ * softirqs-disabled, seqlock_t write side critical section opened with
+ * write_seqlock_bh().
+ */
 static inline void write_sequnlock_bh(seqlock_t *sl)
 {
 	write_seqcount_end(&sl->seqcount);
 	spin_unlock_bh(&sl->lock);
 }
 
+/**
+ * write_seqlock_irq() - start a non-interruptible seqlock_t write section
+ * @sl: Pointer to seqlock_t
+ *
+ * _irq variant of write_seqlock(). Use only if the read side section, or
+ * other write sections, can be invoked from hardirq contexts.
+ */
 static inline void write_seqlock_irq(seqlock_t *sl)
 {
 	spin_lock_irq(&sl->lock);
 	write_seqcount_begin(&sl->seqcount);
 }
 
+/**
+ * write_sequnlock_irq() - end a non-interruptible seqlock_t write section
+ * @sl: Pointer to seqlock_t
+ *
+ * write_sequnlock_irq closes the serialized and non-interruptible
+ * seqlock_t write side section opened with write_seqlock_irq().
+ */
 static inline void write_sequnlock_irq(seqlock_t *sl)
 {
 	write_seqcount_end(&sl->seqcount);
@@ -528,9 +652,28 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
 	return flags;
 }
 
+/**
+ * write_seqlock_irqsave() - start a non-interruptible seqlock_t write
+ *                           section
+ * @lock:  Pointer to seqlock_t
+ * @flags: Stack-allocated storage for saving caller's local interrupt
+ *         state, to be passed to write_sequnlock_irqrestore().
+ *
+ * _irqsave variant of write_seqlock(). Use it only if the read side
+ * section, or other write sections, can be invoked from hardirq context.
+ */
 #define write_seqlock_irqsave(lock, flags)				\
 	do { flags = __write_seqlock_irqsave(lock); } while (0)
 
+/**
+ * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write
+ *                                section
+ * @sl:    Pointer to seqlock_t
+ * @flags: Caller's saved interrupt state, from write_seqlock_irqsave()
+ *
+ * write_sequnlock_irqrestore closes the serialized and non-interruptible
+ * seqlock_t write section previously opened with write_seqlock_irqsave().
+ */
 static inline void
 write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
 {
@@ -538,36 +681,79 @@ write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags)
 	spin_unlock_irqrestore(&sl->lock, flags);
 }
 
-/*
- * A locking reader exclusively locks out other writers and locking readers,
- * but doesn't update the sequence number. Acts like a normal spin_lock/unlock.
- * Don't need preempt_disable() because that is in the spin_lock already.
+/**
+ * read_seqlock_excl() - begin a seqlock_t locking reader section
+ * @sl: Pointer to seqlock_t
+ *
+ * read_seqlock_excl opens a seqlock_t locking reader critical section.  A
+ * locking reader exclusively locks out *both* other writers *and* other
+ * locking readers, but it does not update the embedded sequence number.
+ *
+ * Locking readers act like a normal spin_lock()/spin_unlock().
+ *
+ * Context: if the seqlock_t write section, *or other read sections*, can
+ * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
+ * variant of this function instead.
+ *
+ * The opened read section must be closed with read_sequnlock_excl().
  */
 static inline void read_seqlock_excl(seqlock_t *sl)
 {
 	spin_lock(&sl->lock);
 }
 
+/**
+ * read_sequnlock_excl() - end a seqlock_t locking reader critical section
+ * @sl: Pointer to seqlock_t
+ */
 static inline void read_sequnlock_excl(seqlock_t *sl)
 {
 	spin_unlock(&sl->lock);
 }
 
+/**
+ * read_seqlock_excl_bh() - start a seqlock_t locking reader section with
+ *			    softirqs disabled
+ * @sl: Pointer to seqlock_t
+ *
+ * _bh variant of read_seqlock_excl(). Use this variant only if the
+ * seqlock_t write side section, *or other read sections*, can be invoked
+ * from softirq contexts.
+ */
 static inline void read_seqlock_excl_bh(seqlock_t *sl)
 {
 	spin_lock_bh(&sl->lock);
 }
 
+/**
+ * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking
+ *			      reader section
+ * @sl: Pointer to seqlock_t
+ */
 static inline void read_sequnlock_excl_bh(seqlock_t *sl)
 {
 	spin_unlock_bh(&sl->lock);
 }
 
+/**
+ * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking
+ *			     reader section
+ * @sl: Pointer to seqlock_t
+ *
+ * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t
+ * write side section, *or other read sections*, can be invoked from a
+ * hardirq context.
+ */
 static inline void read_seqlock_excl_irq(seqlock_t *sl)
 {
 	spin_lock_irq(&sl->lock);
 }
 
+/**
+ * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t
+ *                             locking reader section
+ * @sl: Pointer to seqlock_t
+ */
 static inline void read_sequnlock_excl_irq(seqlock_t *sl)
 {
 	spin_unlock_irq(&sl->lock);
@@ -581,9 +767,26 @@ static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl)
 	return flags;
 }
 
+/**
+ * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t
+ *				 locking reader section
+ * @lock:  Pointer to seqlock_t
+ * @flags: Stack-allocated storage for saving caller's local interrupt
+ *         state, to be passed to read_sequnlock_excl_irqrestore().
+ *
+ * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t
+ * write side section, *or other read sections*, can be invoked from a
+ * hardirq context.
+ */
 #define read_seqlock_excl_irqsave(lock, flags)				\
 	do { flags = __read_seqlock_excl_irqsave(lock); } while (0)
 
+/**
+ * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t
+ *				      locking reader section
+ * @sl:    Pointer to seqlock_t
+ * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave()
+ */
 static inline void
 read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
 {
@@ -591,14 +794,35 @@ read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags)
 }
 
 /**
- * read_seqbegin_or_lock - begin a sequence number check or locking block
- * @lock: sequence lock
- * @seq : sequence number to be checked
+ * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader
+ * @lock: Pointer to seqlock_t
+ * @seq : Marker and return parameter. If the passed value is even, the
+ * reader will become a *lockless* seqlock_t reader as in read_seqbegin().
+ * If the passed value is odd, the reader will become a *locking* reader
+ * as in read_seqlock_excl().  In the first call to this function, the
+ * caller *must* initialize and pass an even value to @seq; this way, a
+ * lockless read can be optimistically tried first.
  *
- * First try it once optimistically without taking the lock. If that fails,
- * take the lock. The sequence number is also used as a marker for deciding
- * whether to be a reader (even) or writer (odd).
- * N.B. seq must be initialized to an even number to begin with.
+ * read_seqbegin_or_lock is an API designed to optimistically try a normal
+ * lockless seqlock_t read section first.  If an odd counter is found, the
+ * lockless read trial has failed, and the next read iteration transforms
+ * itself into a full seqlock_t locking reader.
+ *
+ * This is typically used to avoid seqlock_t lockless readers starvation
+ * (too much retry loops) in the case of a sharp spike in write side
+ * activity.
+ *
+ * Context: if the seqlock_t write section, *or other read sections*, can
+ * be invoked from hardirq or softirq contexts, use the _irqsave or _bh
+ * variant of this function instead.
+ *
+ * Check Documentation/locking/seqlock.rst for template example code.
+ *
+ * Return: the encountered sequence counter value, through the @seq
+ * parameter, which is overloaded as a return parameter. This returned
+ * value must be checked with need_seqretry(). If the read section need to
+ * be retried, this returned value must also be passed as the @seq
+ * parameter of the next read_seqbegin_or_lock() iteration.
  */
 static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
 {
@@ -608,17 +832,52 @@ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
 		read_seqlock_excl(lock);
 }
 
+/**
+ * need_seqretry() - validate seqlock_t "locking or lockless" read section
+ * @lock: Pointer to seqlock_t
+ * @seq: sequence count, from read_seqbegin_or_lock()
+ *
+ * Return: true if a read section retry is required, false otherwise
+ */
 static inline int need_seqretry(seqlock_t *lock, int seq)
 {
 	return !(seq & 1) && read_seqretry(lock, seq);
 }
 
+/**
+ * done_seqretry() - end seqlock_t "locking or lockless" reader section
+ * @lock: Pointer to seqlock_t
+ * @seq: count, from read_seqbegin_or_lock()
+ *
+ * done_seqretry finishes the seqlock_t read side critical section started
+ * with read_seqbegin_or_lock() and validated by need_seqretry().
+ */
 static inline void done_seqretry(seqlock_t *lock, int seq)
 {
 	if (seq & 1)
 		read_sequnlock_excl(lock);
 }
 
+/**
+ * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or
+ *                                   a non-interruptible locking reader
+ * @lock: Pointer to seqlock_t
+ * @seq:  Marker and return parameter. Check read_seqbegin_or_lock().
+ *
+ * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if
+ * the seqlock_t write section, *or other read sections*, can be invoked
+ * from hardirq context.
+ *
+ * Note: Interrupts will be disabled only for "locking reader" mode.
+ *
+ * Return:
+ *
+ *   1. The saved local interrupts state in case of a locking reader, to
+ *      be passed to done_seqretry_irqrestore().
+ *
+ *   2. The encountered sequence counter value, returned through @seq
+ *      overloaded as a return parameter. Check read_seqbegin_or_lock().
+ */
 static inline unsigned long
 read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
 {
@@ -632,6 +891,18 @@ read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq)
 	return flags;
 }
 
+/**
+ * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a
+ *				non-interruptible locking reader section
+ * @lock:  Pointer to seqlock_t
+ * @seq:   Count, from read_seqbegin_or_lock_irqsave()
+ * @flags: Caller's saved local interrupt state in case of a locking
+ *	   reader, also from read_seqbegin_or_lock_irqsave()
+ *
+ * This is the _irqrestore variant of done_seqretry(). The read section
+ * must've been opened with read_seqbegin_or_lock_irqsave(), and validated
+ * by need_seqretry().
+ */
 static inline void
 done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags)
 {

From 932e46365226324d2cf26d8bdec8b51ceb296948 Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:12 +0200
Subject: [PATCH 484/502] seqlock: Implement raw_seqcount_begin() in terms of
 raw_read_seqcount()

raw_seqcount_begin() has the same code as raw_read_seqcount(), with the
exception of masking the sequence counter's LSB before returning it to
the caller.

Note, raw_seqcount_begin() masks the counter's LSB before returning it
to the caller so that read_seqcount_retry() can fail if the counter is
odd -- without the overhead of an extra branching instruction.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-7-a.darwish@linutronix.de
---
 include/linux/seqlock.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index 85fb3ac93ffb..e885702d8b82 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -199,10 +199,11 @@ static inline unsigned raw_read_seqcount(const seqcount_t *s)
  */
 static inline unsigned raw_seqcount_begin(const seqcount_t *s)
 {
-	unsigned ret = READ_ONCE(s->sequence);
-	smp_rmb();
-	kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX);
-	return ret & ~1;
+	/*
+	 * If the counter is odd, let read_seqcount_retry() fail
+	 * by decrementing the counter.
+	 */
+	return raw_read_seqcount(s) & ~1;
 }
 
 /**

From 8fd8ad5c5dfcb09cf62abadd4043eaf1afbbd0ce Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:13 +0200
Subject: [PATCH 485/502] lockdep: Add preemption enabled/disabled assertion
 APIs

Asserting that preemption is enabled or disabled is a critical sanity
check.  Developers are usually reluctant to add such a check in a
fastpath as reading the preemption count can be costly.

Extend the lockdep API with macros asserting that preemption is disabled
or enabled. If lockdep is disabled, or if the underlying architecture
does not support kernel preemption, this assert has no runtime overhead.

References: f54bb2ec02c8 ("locking/lockdep: Add IRQs disabled/enabled assertion APIs: ...")
Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-8-a.darwish@linutronix.de
---
 include/linux/lockdep.h | 19 +++++++++++++++++++
 lib/Kconfig.debug       |  1 +
 2 files changed, 20 insertions(+)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 7aafba0ddcf9..39a35699d0d6 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -549,6 +549,22 @@ do {									\
 	WARN_ON_ONCE(debug_locks && !this_cpu_read(hardirq_context));	\
 } while (0)
 
+#define lockdep_assert_preemption_enabled()				\
+do {									\
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)	&&		\
+		     debug_locks			&&		\
+		     (preempt_count() != 0		||		\
+		      !this_cpu_read(hardirqs_enabled)));		\
+} while (0)
+
+#define lockdep_assert_preemption_disabled()				\
+do {									\
+	WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT)	&&		\
+		     debug_locks			&&		\
+		     (preempt_count() == 0		&&		\
+		      this_cpu_read(hardirqs_enabled)));		\
+} while (0)
+
 #else
 # define might_lock(lock) do { } while (0)
 # define might_lock_read(lock) do { } while (0)
@@ -557,6 +573,9 @@ do {									\
 # define lockdep_assert_irqs_enabled() do { } while (0)
 # define lockdep_assert_irqs_disabled() do { } while (0)
 # define lockdep_assert_in_irq() do { } while (0)
+
+# define lockdep_assert_preemption_enabled() do { } while (0)
+# define lockdep_assert_preemption_disabled() do { } while (0)
 #endif
 
 #ifdef CONFIG_PROVE_RAW_LOCK_NESTING
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 9ad9210d70a1..5379931ba3b5 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1117,6 +1117,7 @@ config PROVE_LOCKING
 	select DEBUG_RWSEMS
 	select DEBUG_WW_MUTEX_SLOWPATH
 	select DEBUG_LOCK_ALLOC
+	select PREEMPT_COUNT if !ARCH_NO_PREEMPT
 	select TRACE_IRQFLAGS
 	default n
 	help

From 859247d39fb008ea812e8f0c398a58a20c12899e Mon Sep 17 00:00:00 2001
From: "Ahmed S. Darwish" <a.darwish@linutronix.de>
Date: Mon, 20 Jul 2020 17:55:14 +0200
Subject: [PATCH 486/502] seqlock: lockdep assert non-preemptibility on
 seqcount_t write

Preemption must be disabled before entering a sequence count write side
critical section.  Failing to do so, the seqcount read side can preempt
the write side section and spin for the entire scheduler tick.  If that
reader belongs to a real-time scheduling class, it can spin forever and
the kernel will livelock.

Assert through lockdep that preemption is disabled for seqcount writers.

Signed-off-by: Ahmed S. Darwish <a.darwish@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200720155530.1173732-9-a.darwish@linutronix.de
---
 include/linux/seqlock.h | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h
index e885702d8b82..54bc20496392 100644
--- a/include/linux/seqlock.h
+++ b/include/linux/seqlock.h
@@ -266,6 +266,12 @@ static inline void raw_write_seqcount_end(seqcount_t *s)
 	kcsan_nestable_atomic_end();
 }
 
+static inline void __write_seqcount_begin_nested(seqcount_t *s, int subclass)
+{
+	raw_write_seqcount_begin(s);
+	seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
+}
+
 /**
  * write_seqcount_begin_nested() - start a seqcount_t write section with
  *                                 custom lockdep nesting level
@@ -276,8 +282,19 @@ static inline void raw_write_seqcount_end(seqcount_t *s)
  */
 static inline void write_seqcount_begin_nested(seqcount_t *s, int subclass)
 {
-	raw_write_seqcount_begin(s);
-	seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_);
+	lockdep_assert_preemption_disabled();
+	__write_seqcount_begin_nested(s, subclass);
+}
+
+/*
+ * A write_seqcount_begin() variant w/o lockdep non-preemptibility checks.
+ *
+ * Use for internal seqlock.h code where it's known that preemption is
+ * already disabled. For example, seqlock_t write side functions.
+ */
+static inline void __write_seqcount_begin(seqcount_t *s)
+{
+	__write_seqcount_begin_nested(s, 0);
 }
 
 /**
@@ -575,7 +592,7 @@ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start)
 static inline void write_seqlock(seqlock_t *sl)
 {
 	spin_lock(&sl->lock);
-	write_seqcount_begin(&sl->seqcount);
+	__write_seqcount_begin(&sl->seqcount);
 }
 
 /**
@@ -601,7 +618,7 @@ static inline void write_sequnlock(seqlock_t *sl)
 static inline void write_seqlock_bh(seqlock_t *sl)
 {
 	spin_lock_bh(&sl->lock);
-	write_seqcount_begin(&sl->seqcount);
+	__write_seqcount_begin(&sl->seqcount);
 }
 
 /**
@@ -628,7 +645,7 @@ static inline void write_sequnlock_bh(seqlock_t *sl)
 static inline void write_seqlock_irq(seqlock_t *sl)
 {
 	spin_lock_irq(&sl->lock);
-	write_seqcount_begin(&sl->seqcount);
+	__write_seqcount_begin(&sl->seqcount);
 }
 
 /**
@@ -649,7 +666,7 @@ static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl)
 	unsigned long flags;
 
 	spin_lock_irqsave(&sl->lock, flags);
-	write_seqcount_begin(&sl->seqcount);
+	__write_seqcount_begin(&sl->seqcount);
 	return flags;
 }
 

From c4334d576cf420a7d0f4349ce0b0a8ed0de3938f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 25 Jul 2020 17:32:05 -0700
Subject: [PATCH 487/502] arm64: pgtable-hwdef.h: delete duplicated words

Drop the repeated words "at" and "the".

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20200726003207.20253-2-rdunlap@infradead.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/pgtable-hwdef.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 9c91a8f93a0e..b18ba4452873 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -29,7 +29,7 @@
  * Size mapped by an entry at level n ( 0 <= n <= 3)
  * We map (PAGE_SHIFT - 3) at all translation levels and PAGE_SHIFT bits
  * in the final page. The maximum number of translation levels supported by
- * the architecture is 4. Hence, starting at at level n, we have further
+ * the architecture is 4. Hence, starting at level n, we have further
  * ((4 - n) - 1) levels of translation excluding the offset within the page.
  * So, the total number of bits mapped by an entry at level n is :
  *
@@ -98,7 +98,7 @@
 #define CONT_PMDS		(1 << CONT_PMD_SHIFT)
 #define CONT_PMD_SIZE		(CONT_PMDS * PMD_SIZE)
 #define CONT_PMD_MASK		(~(CONT_PMD_SIZE - 1))
-/* the the numerical offset of the PTE within a range of CONT_PTES */
+/* the numerical offset of the PTE within a range of CONT_PTES */
 #define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1))
 
 /*

From c4b5abba008399dc4450ab6f62b2deb5acd3697e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 25 Jul 2020 17:32:06 -0700
Subject: [PATCH 488/502] arm64: ptrace.h: delete duplicated word

Drop the repeated word "the".

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20200726003207.20253-3-rdunlap@infradead.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/ptrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 953b6a1ce549..966ed30ed5f7 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -27,7 +27,7 @@
  *
  * Some code sections either automatically switch back to PSR.I or explicitly
  * require to not use priority masking. If bit GIC_PRIO_PSR_I_SET is included
- * in the  the priority mask, it indicates that PSR.I should be set and
+ * in the priority mask, it indicates that PSR.I should be set and
  * interrupt disabling temporarily does not rely on IRQ priorities.
  */
 #define GIC_PRIO_IRQON			0xe0

From 1a9ea25d1874ca457a596738b40fa4f3bec6fc8f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sat, 25 Jul 2020 17:32:07 -0700
Subject: [PATCH 489/502] arm64: sigcontext.h: delete duplicated word

Drop the repeated word "the".

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Link: https://lore.kernel.org/r/20200726003207.20253-4-rdunlap@infradead.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/uapi/asm/sigcontext.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/include/uapi/asm/sigcontext.h b/arch/arm64/include/uapi/asm/sigcontext.h
index 8b0ebce92427..0c796c795dbe 100644
--- a/arch/arm64/include/uapi/asm/sigcontext.h
+++ b/arch/arm64/include/uapi/asm/sigcontext.h
@@ -179,7 +179,7 @@ struct sve_context {
  * The same convention applies when returning from a signal: a caller
  * will need to remove or resize the sve_context block if it wants to
  * make the SVE registers live when they were previously non-live or
- * vice-versa.  This may require the the caller to allocate fresh
+ * vice-versa.  This may require the caller to allocate fresh
  * memory and/or move other context blocks in the signal frame.
  *
  * Changing the vector length during signal return is not permitted:

From c4885bbb3afee80f41d39a33e49881a18e500f47 Mon Sep 17 00:00:00 2001
From: Pingfan Liu <kernelfans@gmail.com>
Date: Fri, 10 Jul 2020 22:04:12 +0800
Subject: [PATCH 490/502] arm64/mm: save memory access in
 check_and_switch_context() fast switch path

On arm64, smp_processor_id() reads a per-cpu `cpu_number` variable,
using the per-cpu offset stored in the tpidr_el1 system register. In
some cases we generate a per-cpu address with a sequence like:

  cpu_ptr = &per_cpu(ptr, smp_processor_id());

Which potentially incurs a cache miss for both `cpu_number` and the
in-memory `__per_cpu_offset` array. This can be written more optimally
as:

  cpu_ptr = this_cpu_ptr(ptr);

Which only needs the offset from tpidr_el1, and does not need to
load from memory.

The following two test cases show a small performance improvement measured
on a 46-CPU Qualcomm machine with a 5.8.0-rc4 kernel.

Test 1: (about 0.3% improvement)
    #cat b.sh
    make clean && make all -j138
    #perf stat --repeat 10 --null --sync sh b.sh

    - before this patch
     Performance counter stats for 'sh b.sh' (10 runs):

                298.62 +- 1.86 seconds time elapsed  ( +-  0.62% )

    - after this patch
     Performance counter stats for 'sh b.sh' (10 runs):

               297.734 +- 0.954 seconds time elapsed  ( +-  0.32% )

Test 2: (about 1.69% improvement)
     'perf stat -r 10 perf bench sched messaging'
        Then sum the total time of 'sched/messaging' manually.

    - before this patch
      total 0.707 sec for 10 times
    - after this patch
      total 0.695 sec for 10 times

Signed-off-by: Pingfan Liu <kernelfans@gmail.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Steve Capper <steve.capper@arm.com>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Vladimir Murzin <vladimir.murzin@arm.com>
Cc: Jean-Philippe Brucker <jean-philippe@linaro.org>
Link: https://lore.kernel.org/r/1594389852-19949-1-git-send-email-kernelfans@gmail.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/mmu_context.h |  6 ++----
 arch/arm64/mm/context.c              | 10 ++++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h
index b0bd9b55594c..f2d7537d6f83 100644
--- a/arch/arm64/include/asm/mmu_context.h
+++ b/arch/arm64/include/asm/mmu_context.h
@@ -175,7 +175,7 @@ static inline void cpu_replace_ttbr1(pgd_t *pgdp)
  * take CPU migration into account.
  */
 #define destroy_context(mm)		do { } while(0)
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu);
+void check_and_switch_context(struct mm_struct *mm);
 
 #define init_new_context(tsk,mm)	({ atomic64_set(&(mm)->context.id, 0); 0; })
 
@@ -214,8 +214,6 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 
 static inline void __switch_mm(struct mm_struct *next)
 {
-	unsigned int cpu = smp_processor_id();
-
 	/*
 	 * init_mm.pgd does not contain any user mappings and it is always
 	 * active for kernel addresses in TTBR1. Just set the reserved TTBR0.
@@ -225,7 +223,7 @@ static inline void __switch_mm(struct mm_struct *next)
 		return;
 	}
 
-	check_and_switch_context(next, cpu);
+	check_and_switch_context(next);
 }
 
 static inline void
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index d702d60e64da..a206655a39a5 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -198,9 +198,10 @@ set_asid:
 	return idx2asid(asid) | generation;
 }
 
-void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
+void check_and_switch_context(struct mm_struct *mm)
 {
 	unsigned long flags;
+	unsigned int cpu;
 	u64 asid, old_active_asid;
 
 	if (system_supports_cnp())
@@ -222,9 +223,9 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 	 *   relaxed xchg in flush_context will treat us as reserved
 	 *   because atomic RmWs are totally ordered for a given location.
 	 */
-	old_active_asid = atomic64_read(&per_cpu(active_asids, cpu));
+	old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
 	if (old_active_asid && asid_gen_match(asid) &&
-	    atomic64_cmpxchg_relaxed(&per_cpu(active_asids, cpu),
+	    atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
 				     old_active_asid, asid))
 		goto switch_mm_fastpath;
 
@@ -236,10 +237,11 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu)
 		atomic64_set(&mm->context.id, asid);
 	}
 
+	cpu = smp_processor_id();
 	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
 		local_flush_tlb_all();
 
-	atomic64_set(&per_cpu(active_asids, cpu), asid);
+	atomic64_set(this_cpu_ptr(&active_asids), asid);
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
 switch_mm_fastpath:

From 010e8e6be2194678f7e4bb3044c088bbee779f57 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 30 Jul 2020 18:43:45 +0300
Subject: [PATCH 491/502] io_uring: de-unionise io_kiocb

As io_kiocb has enough space, move ->work out of a union. It's safer
this way and removes ->work memcpy bouncing.
While at it, make the indentation in struct io_kiocb consistent.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 57 ++++++++++++---------------------------------------
 1 file changed, 13 insertions(+), 44 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 3e406bc1f855..86ec5669fe50 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -600,7 +600,6 @@ enum {
 struct async_poll {
 	struct io_poll_iocb	poll;
 	struct io_poll_iocb	*double_poll;
-	struct io_wq_work	work;
 };
 
 /*
@@ -641,36 +640,26 @@ struct io_kiocb {
 	u16				buf_index;
 	u32				result;
 
-	struct io_ring_ctx	*ctx;
-	unsigned int		flags;
-	refcount_t		refs;
-	struct task_struct	*task;
-	u64			user_data;
+	struct io_ring_ctx		*ctx;
+	unsigned int			flags;
+	refcount_t			refs;
+	struct task_struct		*task;
+	u64				user_data;
 
-	struct list_head	link_list;
+	struct list_head		link_list;
 
 	/*
 	 * 1. used with ctx->iopoll_list with reads/writes
 	 * 2. to track reqs with ->files (see io_op_def::file_table)
 	 */
-	struct list_head	inflight_entry;
+	struct list_head		inflight_entry;
 
-	struct percpu_ref	*fixed_file_refs;
-
-	union {
-		/*
-		 * Only commands that never go async can use the below fields,
-		 * obviously. Right now only IORING_OP_POLL_ADD uses them, and
-		 * async armed poll handlers for regular commands. The latter
-		 * restore the work, if needed.
-		 */
-		struct {
-			struct hlist_node	hash_node;
-			struct async_poll	*apoll;
-		};
-		struct io_wq_work	work;
-	};
-	struct callback_head	task_work;
+	struct percpu_ref		*fixed_file_refs;
+	struct callback_head		task_work;
+	/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
+	struct hlist_node		hash_node;
+	struct async_poll		*apoll;
+	struct io_wq_work		work;
 };
 
 struct io_defer_entry {
@@ -4668,10 +4657,6 @@ static void io_async_task_func(struct callback_head *cb)
 	io_poll_remove_double(req, apoll->double_poll);
 	spin_unlock_irq(&ctx->completion_lock);
 
-	/* restore ->work in case we need to retry again */
-	if (req->flags & REQ_F_WORK_INITIALIZED)
-		memcpy(&req->work, &apoll->work, sizeof(req->work));
-
 	if (!READ_ONCE(apoll->poll.canceled))
 		__io_req_task_submit(req);
 	else
@@ -4763,9 +4748,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 	apoll->double_poll = NULL;
 
 	req->flags |= REQ_F_POLLED;
-	if (req->flags & REQ_F_WORK_INITIALIZED)
-		memcpy(&apoll->work, &req->work, sizeof(req->work));
-
 	io_get_req_task(req);
 	req->apoll = apoll;
 	INIT_HLIST_NODE(&req->hash_node);
@@ -4784,8 +4766,6 @@ static bool io_arm_poll_handler(struct io_kiocb *req)
 	if (ret) {
 		io_poll_remove_double(req, apoll->double_poll);
 		spin_unlock_irq(&ctx->completion_lock);
-		if (req->flags & REQ_F_WORK_INITIALIZED)
-			memcpy(&req->work, &apoll->work, sizeof(req->work));
 		kfree(apoll->double_poll);
 		kfree(apoll);
 		return false;
@@ -4828,14 +4808,6 @@ static bool io_poll_remove_one(struct io_kiocb *req)
 		do_complete = __io_poll_remove_one(req, &apoll->poll);
 		if (do_complete) {
 			io_put_req(req);
-			/*
-			 * restore ->work because we will call
-			 * io_req_clean_work below when dropping the
-			 * final reference.
-			 */
-			if (req->flags & REQ_F_WORK_INITIALIZED)
-				memcpy(&req->work, &apoll->work,
-				       sizeof(req->work));
 			kfree(apoll->double_poll);
 			kfree(apoll);
 		}
@@ -4969,9 +4941,6 @@ static int io_poll_add(struct io_kiocb *req)
 	struct io_poll_table ipt;
 	__poll_t mask;
 
-	/* ->work is in union with hash_node and others */
-	io_req_clean_work(req);
-
 	INIT_HLIST_NODE(&req->hash_node);
 	ipt.pt._qproc = io_poll_queue_proc;
 

From 81b68a5ca0ab5d92229a7b76332b9ce88bd6dbd1 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 30 Jul 2020 18:43:46 +0300
Subject: [PATCH 492/502] io_uring: deduplicate __io_complete_rw()

Call __io_complete_rw() in io_iopoll_queue() instead of hand coding it.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 86ec5669fe50..11f4ab87e08f 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -891,7 +891,8 @@ enum io_mem_account {
 	ACCT_PINNED,
 };
 
-static bool io_rw_reissue(struct io_kiocb *req, long res);
+static void __io_complete_rw(struct io_kiocb *req, long res, long res2,
+			     struct io_comp_state *cs);
 static void io_cqring_fill_event(struct io_kiocb *req, long res);
 static void io_put_req(struct io_kiocb *req);
 static void io_double_put_req(struct io_kiocb *req);
@@ -902,8 +903,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
 				 struct io_uring_files_update *ip,
 				 unsigned nr_args);
 static int io_prep_work_files(struct io_kiocb *req);
-static void io_complete_rw_common(struct kiocb *kiocb, long res,
-				  struct io_comp_state *cs);
 static void __io_clean_op(struct io_kiocb *req);
 static int io_file_get(struct io_submit_state *state, struct io_kiocb *req,
 		       int fd, struct file **out_file, bool fixed);
@@ -1976,8 +1975,7 @@ static void io_iopoll_queue(struct list_head *again)
 	do {
 		req = list_first_entry(again, struct io_kiocb, inflight_entry);
 		list_del(&req->inflight_entry);
-		if (!io_rw_reissue(req, -EAGAIN))
-			io_complete_rw_common(&req->rw.kiocb, -EAGAIN, NULL);
+		__io_complete_rw(req, -EAGAIN, 0, NULL);
 	} while (!list_empty(again));
 }
 

From b2bd1cf99f3e7c8fbf12ea07af2c6998e1209e25 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 30 Jul 2020 18:43:47 +0300
Subject: [PATCH 493/502] io_uring: fix racy overflow count reporting

All ->cq_overflow modifications should be under completion_lock,
otherwise it can report a wrong number to the userspace. Fix it in
io_uring_cancel_files().

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 11f4ab87e08f..6e2322525da6 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7847,10 +7847,9 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 				clear_bit(0, &ctx->cq_check_overflow);
 				ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
 			}
-			spin_unlock_irq(&ctx->completion_lock);
-
 			WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
+			spin_unlock_irq(&ctx->completion_lock);
 
 			/*
 			 * Put inflight ref and overflow ref. If that's

From dd9dfcdf5a603680458f5e7b0d2273c66e5417db Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 30 Jul 2020 18:43:48 +0300
Subject: [PATCH 494/502] io_uring: fix stalled deferred requests

Always do io_commit_cqring() after completing a request, even if it was
accounted as overflowed on the CQ side. Failing to do that may lead to
deferred requests not being pushed when needed, and so stalling the whole
ring.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 6e2322525da6..11c1abe8bd1a 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7849,6 +7849,7 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 			}
 			WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
+			io_commit_cqring(ctx);
 			spin_unlock_irq(&ctx->completion_lock);
 
 			/*

From 4693014340808e7f099e302c1dc40e9d79ff7667 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 30 Jul 2020 18:43:49 +0300
Subject: [PATCH 495/502] io_uring: consolidate *_check_overflow accounting

Add a helper to mark ctx->{cq,sq}_check_overflow to get rid of
duplicates, and it's clearer to check cq_overflow_list directly anyway.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 11c1abe8bd1a..efec290c6b08 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1303,6 +1303,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx)
 		eventfd_signal(ctx->cq_ev_fd, 1);
 }
 
+static void io_cqring_mark_overflow(struct io_ring_ctx *ctx)
+{
+	if (list_empty(&ctx->cq_overflow_list)) {
+		clear_bit(0, &ctx->sq_check_overflow);
+		clear_bit(0, &ctx->cq_check_overflow);
+		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
+	}
+}
+
 /* Returns true if there are no backlogged entries after the flush */
 static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
@@ -1347,11 +1356,8 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 	}
 
 	io_commit_cqring(ctx);
-	if (cqe) {
-		clear_bit(0, &ctx->sq_check_overflow);
-		clear_bit(0, &ctx->cq_check_overflow);
-		ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
-	}
+	io_cqring_mark_overflow(ctx);
+
 	spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	io_cqring_ev_posted(ctx);
 
@@ -7842,11 +7848,8 @@ static void io_uring_cancel_files(struct io_ring_ctx *ctx,
 			spin_lock_irq(&ctx->completion_lock);
 			list_del(&cancel_req->compl.list);
 			cancel_req->flags &= ~REQ_F_OVERFLOW;
-			if (list_empty(&ctx->cq_overflow_list)) {
-				clear_bit(0, &ctx->sq_check_overflow);
-				clear_bit(0, &ctx->cq_check_overflow);
-				ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW;
-			}
+
+			io_cqring_mark_overflow(ctx);
 			WRITE_ONCE(ctx->rings->cq_overflow,
 				atomic_inc_return(&ctx->cached_cq_overflow));
 			io_commit_cqring(ctx);

From 01cec8c18f5ad9c27eee9f21439072832181039e Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Thu, 30 Jul 2020 18:43:50 +0300
Subject: [PATCH 496/502] io_uring: get rid of atomic FAA for cq_timeouts

If ->cq_timeouts modifications are done under ->completion_lock, we
don't really need any fetch-and-add or other complex atomics. Replace it
with a non-atomic FAA, which saves an implicit full memory barrier.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index efec290c6b08..fabf0b692384 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1205,7 +1205,8 @@ static void io_kill_timeout(struct io_kiocb *req)
 
 	ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
 	if (ret != -1) {
-		atomic_inc(&req->ctx->cq_timeouts);
+		atomic_set(&req->ctx->cq_timeouts,
+			atomic_read(&req->ctx->cq_timeouts) + 1);
 		list_del_init(&req->timeout.list);
 		req->flags |= REQ_F_COMP_LOCKED;
 		io_cqring_fill_event(req, 0);
@@ -4972,9 +4973,10 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer)
 	struct io_ring_ctx *ctx = req->ctx;
 	unsigned long flags;
 
-	atomic_inc(&ctx->cq_timeouts);
-
 	spin_lock_irqsave(&ctx->completion_lock, flags);
+	atomic_set(&req->ctx->cq_timeouts,
+		atomic_read(&req->ctx->cq_timeouts) + 1);
+
 	/*
 	 * We could be racing with timeout deletion. If the list is empty,
 	 * then timeout lookup already found it and will be handling it.

From 0584df9c12f449124d0bfef9899e5365604ee7a9 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 29 Jul 2020 13:09:15 +0200
Subject: [PATCH 497/502] lockdep: Refactor IRQ trace events fields into struct

Refactor the IRQ trace events fields, used for printing information
about the IRQ trace events, into a separate struct 'irqtrace_events'.

This improves readability by separating the information only used in
reporting, as well as enables (simplified) storing/restoring of
irqtrace_events snapshots.

No functional change intended.

Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200729110916.3920464-1-elver@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/irqflags.h | 13 +++++++++
 include/linux/sched.h    | 11 ++------
 kernel/fork.c            | 16 ++++-------
 kernel/locking/lockdep.c | 58 +++++++++++++++++++++-------------------
 4 files changed, 50 insertions(+), 48 deletions(-)

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 5811ee8a5cd8..bd5c55755447 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -33,6 +33,19 @@
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 
+/* Per-task IRQ trace events information. */
+struct irqtrace_events {
+	unsigned int	irq_events;
+	unsigned long	hardirq_enable_ip;
+	unsigned long	hardirq_disable_ip;
+	unsigned int	hardirq_enable_event;
+	unsigned int	hardirq_disable_event;
+	unsigned long	softirq_disable_ip;
+	unsigned long	softirq_enable_ip;
+	unsigned int	softirq_disable_event;
+	unsigned int	softirq_enable_event;
+};
+
 DECLARE_PER_CPU(int, hardirqs_enabled);
 DECLARE_PER_CPU(int, hardirq_context);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8d1de021b315..52e0fdd6a555 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/plist.h>
 #include <linux/hrtimer.h>
+#include <linux/irqflags.h>
 #include <linux/seccomp.h>
 #include <linux/nodemask.h>
 #include <linux/rcupdate.h>
@@ -980,17 +981,9 @@ struct task_struct {
 #endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
-	unsigned int			irq_events;
+	struct irqtrace_events		irqtrace;
 	unsigned int			hardirq_threaded;
-	unsigned long			hardirq_enable_ip;
-	unsigned long			hardirq_disable_ip;
-	unsigned int			hardirq_enable_event;
-	unsigned int			hardirq_disable_event;
 	u64				hardirq_chain_key;
-	unsigned long			softirq_disable_ip;
-	unsigned long			softirq_enable_ip;
-	unsigned int			softirq_disable_event;
-	unsigned int			softirq_enable_event;
 	int				softirqs_enabled;
 	int				softirq_context;
 	int				irq_config;
diff --git a/kernel/fork.c b/kernel/fork.c
index 70d9d0a4de2a..56a640799680 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2035,17 +2035,11 @@ static __latent_entropy struct task_struct *copy_process(
 	seqcount_init(&p->mems_allowed_seq);
 #endif
 #ifdef CONFIG_TRACE_IRQFLAGS
-	p->irq_events = 0;
-	p->hardirq_enable_ip = 0;
-	p->hardirq_enable_event = 0;
-	p->hardirq_disable_ip = _THIS_IP_;
-	p->hardirq_disable_event = 0;
-	p->softirqs_enabled = 1;
-	p->softirq_enable_ip = _THIS_IP_;
-	p->softirq_enable_event = 0;
-	p->softirq_disable_ip = 0;
-	p->softirq_disable_event = 0;
-	p->softirq_context = 0;
+	memset(&p->irqtrace, 0, sizeof(p->irqtrace));
+	p->irqtrace.hardirq_disable_ip	= _THIS_IP_;
+	p->irqtrace.softirq_enable_ip	= _THIS_IP_;
+	p->softirqs_enabled		= 1;
+	p->softirq_context		= 0;
 #endif
 
 	p->pagefault_disabled = 0;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index c9ea05edce25..7b5800374c40 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3484,19 +3484,21 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
 
 void print_irqtrace_events(struct task_struct *curr)
 {
-	printk("irq event stamp: %u\n", curr->irq_events);
+	const struct irqtrace_events *trace = &curr->irqtrace;
+
+	printk("irq event stamp: %u\n", trace->irq_events);
 	printk("hardirqs last  enabled at (%u): [<%px>] %pS\n",
-		curr->hardirq_enable_event, (void *)curr->hardirq_enable_ip,
-		(void *)curr->hardirq_enable_ip);
+		trace->hardirq_enable_event, (void *)trace->hardirq_enable_ip,
+		(void *)trace->hardirq_enable_ip);
 	printk("hardirqs last disabled at (%u): [<%px>] %pS\n",
-		curr->hardirq_disable_event, (void *)curr->hardirq_disable_ip,
-		(void *)curr->hardirq_disable_ip);
+		trace->hardirq_disable_event, (void *)trace->hardirq_disable_ip,
+		(void *)trace->hardirq_disable_ip);
 	printk("softirqs last  enabled at (%u): [<%px>] %pS\n",
-		curr->softirq_enable_event, (void *)curr->softirq_enable_ip,
-		(void *)curr->softirq_enable_ip);
+		trace->softirq_enable_event, (void *)trace->softirq_enable_ip,
+		(void *)trace->softirq_enable_ip);
 	printk("softirqs last disabled at (%u): [<%px>] %pS\n",
-		curr->softirq_disable_event, (void *)curr->softirq_disable_ip,
-		(void *)curr->softirq_disable_ip);
+		trace->softirq_disable_event, (void *)trace->softirq_disable_ip,
+		(void *)trace->softirq_disable_ip);
 }
 
 static int HARDIRQ_verbose(struct lock_class *class)
@@ -3699,7 +3701,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on_prepare);
 
 void noinstr lockdep_hardirqs_on(unsigned long ip)
 {
-	struct task_struct *curr = current;
+	struct irqtrace_events *trace = &current->irqtrace;
 
 	if (unlikely(!debug_locks))
 		return;
@@ -3752,8 +3754,8 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
 skip_checks:
 	/* we'll do an OFF -> ON transition: */
 	this_cpu_write(hardirqs_enabled, 1);
-	curr->hardirq_enable_ip = ip;
-	curr->hardirq_enable_event = ++curr->irq_events;
+	trace->hardirq_enable_ip = ip;
+	trace->hardirq_enable_event = ++trace->irq_events;
 	debug_atomic_inc(hardirqs_on_events);
 }
 EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
@@ -3763,8 +3765,6 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_on);
  */
 void noinstr lockdep_hardirqs_off(unsigned long ip)
 {
-	struct task_struct *curr = current;
-
 	if (unlikely(!debug_locks))
 		return;
 
@@ -3784,12 +3784,14 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
 		return;
 
 	if (lockdep_hardirqs_enabled()) {
+		struct irqtrace_events *trace = &current->irqtrace;
+
 		/*
 		 * We have done an ON -> OFF transition:
 		 */
 		this_cpu_write(hardirqs_enabled, 0);
-		curr->hardirq_disable_ip = ip;
-		curr->hardirq_disable_event = ++curr->irq_events;
+		trace->hardirq_disable_ip = ip;
+		trace->hardirq_disable_event = ++trace->irq_events;
 		debug_atomic_inc(hardirqs_off_events);
 	} else {
 		debug_atomic_inc(redundant_hardirqs_off);
@@ -3802,7 +3804,7 @@ EXPORT_SYMBOL_GPL(lockdep_hardirqs_off);
  */
 void lockdep_softirqs_on(unsigned long ip)
 {
-	struct task_struct *curr = current;
+	struct irqtrace_events *trace = &current->irqtrace;
 
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
@@ -3814,7 +3816,7 @@ void lockdep_softirqs_on(unsigned long ip)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (curr->softirqs_enabled) {
+	if (current->softirqs_enabled) {
 		debug_atomic_inc(redundant_softirqs_on);
 		return;
 	}
@@ -3823,9 +3825,9 @@ void lockdep_softirqs_on(unsigned long ip)
 	/*
 	 * We'll do an OFF -> ON transition:
 	 */
-	curr->softirqs_enabled = 1;
-	curr->softirq_enable_ip = ip;
-	curr->softirq_enable_event = ++curr->irq_events;
+	current->softirqs_enabled = 1;
+	trace->softirq_enable_ip = ip;
+	trace->softirq_enable_event = ++trace->irq_events;
 	debug_atomic_inc(softirqs_on_events);
 	/*
 	 * We are going to turn softirqs on, so set the
@@ -3833,7 +3835,7 @@ void lockdep_softirqs_on(unsigned long ip)
 	 * enabled too:
 	 */
 	if (lockdep_hardirqs_enabled())
-		mark_held_locks(curr, LOCK_ENABLED_SOFTIRQ);
+		mark_held_locks(current, LOCK_ENABLED_SOFTIRQ);
 	lockdep_recursion_finish();
 }
 
@@ -3842,8 +3844,6 @@ void lockdep_softirqs_on(unsigned long ip)
  */
 void lockdep_softirqs_off(unsigned long ip)
 {
-	struct task_struct *curr = current;
-
 	if (unlikely(!debug_locks || current->lockdep_recursion))
 		return;
 
@@ -3853,13 +3853,15 @@ void lockdep_softirqs_off(unsigned long ip)
 	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
 		return;
 
-	if (curr->softirqs_enabled) {
+	if (current->softirqs_enabled) {
+		struct irqtrace_events *trace = &current->irqtrace;
+
 		/*
 		 * We have done an ON -> OFF transition:
 		 */
-		curr->softirqs_enabled = 0;
-		curr->softirq_disable_ip = ip;
-		curr->softirq_disable_event = ++curr->irq_events;
+		current->softirqs_enabled = 0;
+		trace->softirq_disable_ip = ip;
+		trace->softirq_disable_event = ++trace->irq_events;
 		debug_atomic_inc(softirqs_off_events);
 		/*
 		 * Whoops, we wanted softirqs off, so why aren't they?

From 92c209ac6d3d35783c16c8a717547183e6e11162 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver@google.com>
Date: Wed, 29 Jul 2020 13:09:16 +0200
Subject: [PATCH 498/502] kcsan: Improve IRQ state trace reporting

To improve the general usefulness of the IRQ state trace events with
KCSAN enabled, save and restore the trace information when entering and
exiting the KCSAN runtime as well as when generating a KCSAN report.

Without this, reporting the IRQ trace events (whether via a KCSAN report
or outside of KCSAN via a lockdep report) is rather useless due to
continuously being touched by KCSAN. This is because if KCSAN is
enabled, every instrumented memory access causes changes to IRQ trace
events (either by KCSAN disabling/enabling interrupts or taking
report_lock when generating a report).

Before "lockdep: Prepare for NMI IRQ state tracking", KCSAN avoided
touching the IRQ trace events via raw_local_irq_save/restore() and
lockdep_off/on().

Fixes: 248591f5d257 ("kcsan: Make KCSAN compatible with new IRQ state tracking")
Signed-off-by: Marco Elver <elver@google.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lore.kernel.org/r/20200729110916.3920464-2-elver@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  4 ++++
 kernel/kcsan/core.c   | 23 +++++++++++++++++++++++
 kernel/kcsan/kcsan.h  |  7 +++++++
 kernel/kcsan/report.c |  3 +++
 4 files changed, 37 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52e0fdd6a555..060e9214c8b5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1184,8 +1184,12 @@ struct task_struct {
 #ifdef CONFIG_KASAN
 	unsigned int			kasan_depth;
 #endif
+
 #ifdef CONFIG_KCSAN
 	struct kcsan_ctx		kcsan_ctx;
+#ifdef CONFIG_TRACE_IRQFLAGS
+	struct irqtrace_events		kcsan_save_irqtrace;
+#endif
 #endif
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 732623c30359..0fe068192781 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -291,6 +291,20 @@ static inline unsigned int get_delay(void)
 				0);
 }
 
+void kcsan_save_irqtrace(struct task_struct *task)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+	task->kcsan_save_irqtrace = task->irqtrace;
+#endif
+}
+
+void kcsan_restore_irqtrace(struct task_struct *task)
+{
+#ifdef CONFIG_TRACE_IRQFLAGS
+	task->irqtrace = task->kcsan_save_irqtrace;
+#endif
+}
+
 /*
  * Pull everything together: check_access() below contains the performance
  * critical operations; the fast-path (including check_access) functions should
@@ -336,9 +350,11 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
 	flags = user_access_save();
 
 	if (consumed) {
+		kcsan_save_irqtrace(current);
 		kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_MAYBE,
 			     KCSAN_REPORT_CONSUMED_WATCHPOINT,
 			     watchpoint - watchpoints);
+		kcsan_restore_irqtrace(current);
 	} else {
 		/*
 		 * The other thread may not print any diagnostics, as it has
@@ -396,6 +412,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
 		goto out;
 	}
 
+	/*
+	 * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
+	 * runtime is entered for every memory access, and potentially useful
+	 * information is lost if dirtied by KCSAN.
+	 */
+	kcsan_save_irqtrace(current);
 	if (!kcsan_interrupt_watcher)
 		local_irq_save(irq_flags);
 
@@ -539,6 +561,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
 out_unlock:
 	if (!kcsan_interrupt_watcher)
 		local_irq_restore(irq_flags);
+	kcsan_restore_irqtrace(current);
 out:
 	user_access_restore(ua_flags);
 }
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 763d6d08d94b..29480010dc30 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -9,6 +9,7 @@
 #define _KERNEL_KCSAN_KCSAN_H
 
 #include <linux/kcsan.h>
+#include <linux/sched.h>
 
 /* The number of adjacent watchpoints to check. */
 #define KCSAN_CHECK_ADJACENT 1
@@ -22,6 +23,12 @@ extern unsigned int kcsan_udelay_interrupt;
  */
 extern bool kcsan_enabled;
 
+/*
+ * Save/restore IRQ flags state trace dirtied by KCSAN.
+ */
+void kcsan_save_irqtrace(struct task_struct *task);
+void kcsan_restore_irqtrace(struct task_struct *task);
+
 /*
  * Initialize debugfs file.
  */
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 6b2fb1a6d8cd..9d07e175de0f 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -308,6 +308,9 @@ static void print_verbose_info(struct task_struct *task)
 	if (!task)
 		return;
 
+	/* Restore IRQ state trace for printing. */
+	kcsan_restore_irqtrace(task);
+
 	pr_err("\n");
 	debug_show_held_locks(task);
 	print_irqtrace_events(task);

From d1719f70d0a5b83b12786a7dbc5b9fe396469016 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@kernel.dk>
Date: Thu, 30 Jul 2020 13:43:53 -0600
Subject: [PATCH 499/502] io_uring: don't touch 'ctx' after installing file
 descriptor

As soon as we install the file descriptor, we have to assume that it
can get arbitrarily closed. We currently account memory (and note that
we did) after installing the ring fd, which means that it could be a
potential use-after-free condition if the fd is closed right after
being installed, but before we fiddle with the ctx.

In fact, syzbot reported this exact scenario:

BUG: KASAN: use-after-free in io_account_mem fs/io_uring.c:7397 [inline]
BUG: KASAN: use-after-free in io_uring_create fs/io_uring.c:8369 [inline]
BUG: KASAN: use-after-free in io_uring_setup+0x2797/0x2910 fs/io_uring.c:8400
Read of size 1 at addr ffff888087a41044 by task syz-executor.5/18145

CPU: 0 PID: 18145 Comm: syz-executor.5 Not tainted 5.8.0-rc7-next-20200729-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x18f/0x20d lib/dump_stack.c:118
 print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383
 __kasan_report mm/kasan/report.c:513 [inline]
 kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530
 io_account_mem fs/io_uring.c:7397 [inline]
 io_uring_create fs/io_uring.c:8369 [inline]
 io_uring_setup+0x2797/0x2910 fs/io_uring.c:8400
 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x45c429
Code: 8d b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 5b b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007f8f121d0c78 EFLAGS: 00000246 ORIG_RAX: 00000000000001a9
RAX: ffffffffffffffda RBX: 0000000000008540 RCX: 000000000045c429
RDX: 0000000000000000 RSI: 0000000020000040 RDI: 0000000000000196
RBP: 000000000078bf38 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 000000000078bf0c
R13: 00007fff86698cff R14: 00007f8f121d19c0 R15: 000000000078bf0c

Move the accounting of the ring used locked memory before we get and
install the ring file descriptor.

Cc: stable@vger.kernel.org
Reported-by: syzbot+9d46305e76057f30c74e@syzkaller.appspotmail.com
Fixes: 309758254ea6 ("io_uring: report pinned memory usage")
Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index fabf0b692384..33702f3b5af8 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -8329,6 +8329,15 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 		ret = -EFAULT;
 		goto err;
 	}
+
+	/*
+	 * Account memory _before_ installing the file descriptor. Once
+	 * the descriptor is installed, it can get closed at any time.
+	 */
+	io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
+		       ACCT_LOCKED);
+	ctx->limit_mem = limit_mem;
+
 	/*
 	 * Install ring fd as the very last thing, so we don't risk someone
 	 * having closed it before we finish setup
@@ -8338,9 +8347,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p,
 		goto err;
 
 	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
-	io_account_mem(ctx, ring_pages(p->sq_entries, p->cq_entries),
-		       ACCT_LOCKED);
-	ctx->limit_mem = limit_mem;
 	return ret;
 err:
 	io_ring_ctx_wait_and_kill(ctx);

From 338c11e94e160f80d8352bf9b5da82dd1a910d2f Mon Sep 17 00:00:00 2001
From: Maninder Singh <maninder1.s@samsung.com>
Date: Fri, 31 Jul 2020 17:19:50 +0530
Subject: [PATCH 500/502] arm64: use IRQ_STACK_SIZE instead of THREAD_SIZE for
 irq stack

IRQ_STACK_SIZE can be made different from THREAD_SIZE, and since
IRQ_STACK_SIZE is used when allocating the irq stack, the same define
should be used when printing information about the irq stack.

Signed-off-by: Maninder Singh <maninder1.s@samsung.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/1596196190-14141-1-git-send-email-maninder1.s@samsung.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/traps.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index 47f651df781c..13ebd5ca2070 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -855,7 +855,7 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs)
 	pr_emerg("Task stack:     [0x%016lx..0x%016lx]\n",
 		 tsk_stk, tsk_stk + THREAD_SIZE);
 	pr_emerg("IRQ stack:      [0x%016lx..0x%016lx]\n",
-		 irq_stk, irq_stk + THREAD_SIZE);
+		 irq_stk, irq_stk + IRQ_STACK_SIZE);
 	pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n",
 		 ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE);
 

From 1752f0adea98ef859978c090e0726844348758f9 Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 1 Aug 2020 13:36:33 +0300
Subject: [PATCH 501/502] fs: optimise kiocb_set_rw_flags()

Use a local var to collect flags in kiocb_set_rw_flags(). That spares
some memory writes and allows most of the jumps to be replaced with MOVEcc.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 include/linux/fs.h | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4090320360f4..e535543d31d9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3446,22 +3446,28 @@ static inline int iocb_flags(struct file *file)
 
 static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags)
 {
+	int kiocb_flags = 0;
+
+	if (!flags)
+		return 0;
 	if (unlikely(flags & ~RWF_SUPPORTED))
 		return -EOPNOTSUPP;
 
 	if (flags & RWF_NOWAIT) {
 		if (!(ki->ki_filp->f_mode & FMODE_NOWAIT))
 			return -EOPNOTSUPP;
-		ki->ki_flags |= IOCB_NOWAIT;
+		kiocb_flags |= IOCB_NOWAIT;
 	}
 	if (flags & RWF_HIPRI)
-		ki->ki_flags |= IOCB_HIPRI;
+		kiocb_flags |= IOCB_HIPRI;
 	if (flags & RWF_DSYNC)
-		ki->ki_flags |= IOCB_DSYNC;
+		kiocb_flags |= IOCB_DSYNC;
 	if (flags & RWF_SYNC)
-		ki->ki_flags |= (IOCB_DSYNC | IOCB_SYNC);
+		kiocb_flags |= (IOCB_DSYNC | IOCB_SYNC);
 	if (flags & RWF_APPEND)
-		ki->ki_flags |= IOCB_APPEND;
+		kiocb_flags |= IOCB_APPEND;
+
+	ki->ki_flags |= kiocb_flags;
 	return 0;
 }
 

From fa15bafb71fd7a4d6018dae87cfaf890fd4ab47f Mon Sep 17 00:00:00 2001
From: Pavel Begunkov <asml.silence@gmail.com>
Date: Sat, 1 Aug 2020 13:50:02 +0300
Subject: [PATCH 502/502] io_uring: flip if handling after io_setup_async_rw

As recently done with send/recv, flip the if after rw_verify_area() in
io_{read,write}() and shift the remaining code one tab level to the left.
This removes a jump mispredicted by the compiler on the success/fast path.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/io_uring.c | 146 +++++++++++++++++++++++++-------------------------
 1 file changed, 72 insertions(+), 74 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 33702f3b5af8..6fd0b0f5df68 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -3034,57 +3034,56 @@ static int io_read(struct io_kiocb *req, bool force_nonblock,
 	struct kiocb *kiocb = &req->rw.kiocb;
 	struct iov_iter iter;
 	size_t iov_count;
-	ssize_t io_size, ret;
+	ssize_t io_size, ret, ret2;
+	unsigned long nr_segs;
 
 	ret = io_import_iovec(READ, req, &iovec, &iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
+	io_size = ret;
+	req->result = io_size;
 
 	/* Ensure we clear previously set non-block flag */
 	if (!force_nonblock)
 		kiocb->ki_flags &= ~IOCB_NOWAIT;
 
-	io_size = ret;
-	req->result = io_size;
-
 	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, READ))
 		goto copy_iov;
 
 	iov_count = iov_iter_count(&iter);
+	nr_segs = iter.nr_segs;
 	ret = rw_verify_area(READ, req->file, &kiocb->ki_pos, iov_count);
-	if (!ret) {
-		unsigned long nr_segs = iter.nr_segs;
-		ssize_t ret2 = 0;
+	if (unlikely(ret))
+		goto out_free;
 
-		ret2 = io_iter_do_read(req, &iter);
+	ret2 = io_iter_do_read(req, &iter);
 
-		/* Catch -EAGAIN return for forced non-blocking submission */
-		if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
-			kiocb_done(kiocb, ret2, cs);
-		} else {
-			iter.count = iov_count;
-			iter.nr_segs = nr_segs;
+	/* Catch -EAGAIN return for forced non-blocking submission */
+	if (!force_nonblock || (ret2 != -EAGAIN && ret2 != -EIO)) {
+		kiocb_done(kiocb, ret2, cs);
+	} else {
+		iter.count = iov_count;
+		iter.nr_segs = nr_segs;
 copy_iov:
-			ret = io_setup_async_rw(req, io_size, iovec,
-						inline_vecs, &iter);
-			if (ret)
+		ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
+					&iter);
+		if (ret)
+			goto out_free;
+		/* it's copied and will be cleaned with ->io */
+		iovec = NULL;
+		/* if we can retry, do so with the callbacks armed */
+		if (io_rw_should_retry(req)) {
+			ret2 = io_iter_do_read(req, &iter);
+			if (ret2 == -EIOCBQUEUED) {
+				goto out_free;
+			} else if (ret2 != -EAGAIN) {
+				kiocb_done(kiocb, ret2, cs);
 				goto out_free;
-			/* it's copied and will be cleaned with ->io */
-			iovec = NULL;
-			/* if we can retry, do so with the callbacks armed */
-			if (io_rw_should_retry(req)) {
-				ret2 = io_iter_do_read(req, &iter);
-				if (ret2 == -EIOCBQUEUED) {
-					goto out_free;
-				} else if (ret2 != -EAGAIN) {
-					kiocb_done(kiocb, ret2, cs);
-					goto out_free;
-				}
 			}
-			kiocb->ki_flags &= ~IOCB_WAITQ;
-			return -EAGAIN;
 		}
+		kiocb->ki_flags &= ~IOCB_WAITQ;
+		return -EAGAIN;
 	}
 out_free:
 	if (iovec)
@@ -3117,19 +3116,19 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 	struct kiocb *kiocb = &req->rw.kiocb;
 	struct iov_iter iter;
 	size_t iov_count;
-	ssize_t ret, io_size;
+	ssize_t ret, ret2, io_size;
+	unsigned long nr_segs;
 
 	ret = io_import_iovec(WRITE, req, &iovec, &iter, !force_nonblock);
 	if (ret < 0)
 		return ret;
+	io_size = ret;
+	req->result = io_size;
 
 	/* Ensure we clear previously set non-block flag */
 	if (!force_nonblock)
 		req->rw.kiocb.ki_flags &= ~IOCB_NOWAIT;
 
-	io_size = ret;
-	req->result = io_size;
-
 	/* If the file doesn't support async, just async punt */
 	if (force_nonblock && !io_file_supports_async(req->file, WRITE))
 		goto copy_iov;
@@ -3140,51 +3139,50 @@ static int io_write(struct io_kiocb *req, bool force_nonblock,
 		goto copy_iov;
 
 	iov_count = iov_iter_count(&iter);
+	nr_segs = iter.nr_segs;
 	ret = rw_verify_area(WRITE, req->file, &kiocb->ki_pos, iov_count);
-	if (!ret) {
-		unsigned long nr_segs = iter.nr_segs;
-		ssize_t ret2;
+	if (unlikely(ret))
+		goto out_free;
 
-		/*
-		 * Open-code file_start_write here to grab freeze protection,
-		 * which will be released by another thread in
-		 * io_complete_rw().  Fool lockdep by telling it the lock got
-		 * released so that it doesn't complain about the held lock when
-		 * we return to userspace.
-		 */
-		if (req->flags & REQ_F_ISREG) {
-			__sb_start_write(file_inode(req->file)->i_sb,
-						SB_FREEZE_WRITE, true);
-			__sb_writers_release(file_inode(req->file)->i_sb,
-						SB_FREEZE_WRITE);
-		}
-		kiocb->ki_flags |= IOCB_WRITE;
+	/*
+	 * Open-code file_start_write here to grab freeze protection,
+	 * which will be released by another thread in
+	 * io_complete_rw().  Fool lockdep by telling it the lock got
+	 * released so that it doesn't complain about the held lock when
+	 * we return to userspace.
+	 */
+	if (req->flags & REQ_F_ISREG) {
+		__sb_start_write(file_inode(req->file)->i_sb,
+					SB_FREEZE_WRITE, true);
+		__sb_writers_release(file_inode(req->file)->i_sb,
+					SB_FREEZE_WRITE);
+	}
+	kiocb->ki_flags |= IOCB_WRITE;
 
-		if (req->file->f_op->write_iter)
-			ret2 = call_write_iter(req->file, kiocb, &iter);
-		else
-			ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
+	if (req->file->f_op->write_iter)
+		ret2 = call_write_iter(req->file, kiocb, &iter);
+	else
+		ret2 = loop_rw_iter(WRITE, req->file, kiocb, &iter);
 
-		/*
-		 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
-		 * retry them without IOCB_NOWAIT.
-		 */
-		if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
-			ret2 = -EAGAIN;
-		if (!force_nonblock || ret2 != -EAGAIN) {
-			kiocb_done(kiocb, ret2, cs);
-		} else {
-			iter.count = iov_count;
-			iter.nr_segs = nr_segs;
+	/*
+	 * Raw bdev writes will return -EOPNOTSUPP for IOCB_NOWAIT. Just
+	 * retry them without IOCB_NOWAIT.
+	 */
+	if (ret2 == -EOPNOTSUPP && (kiocb->ki_flags & IOCB_NOWAIT))
+		ret2 = -EAGAIN;
+	if (!force_nonblock || ret2 != -EAGAIN) {
+		kiocb_done(kiocb, ret2, cs);
+	} else {
+		iter.count = iov_count;
+		iter.nr_segs = nr_segs;
 copy_iov:
-			ret = io_setup_async_rw(req, io_size, iovec,
-						inline_vecs, &iter);
-			if (ret)
-				goto out_free;
-			/* it's copied and will be cleaned with ->io */
-			iovec = NULL;
-			return -EAGAIN;
-		}
+		ret = io_setup_async_rw(req, io_size, iovec, inline_vecs,
+					&iter);
+		if (ret)
+			goto out_free;
+		/* it's copied and will be cleaned with ->io */
+		iovec = NULL;
+		return -EAGAIN;
 	}
 out_free:
 	if (iovec)