Difference between revisions of "Mediawiki XML page importing"
m (→Create a page (first time): hint to run maintenance) |
m (→Bash Script to Convert or Modify a Special:Export: +Note comments) |
||
(5 intermediate revisions by the same user not shown) | |||
Line 140: | Line 140: | ||
# When importing through the web interface, additional versions are created, with the date of import. In this case the sequence of imports rather than dates counts, because these additional versions get the date/time of import! - Avoid using the web interface, when importing versions! | # When importing through the web interface, additional versions are created, with the date of import. In this case the sequence of imports rather than dates counts, because these additional versions get the date/time of import! - Avoid using the web interface, when importing versions! | ||
− | + | Tips: The Wiki code inside the <code><nowiki><text xml:space="preserve">…</text></nowiki></code> must be encoded properly. The Importer will not complain if you have a <code><nowiki><i>Text</i></nowiki></code> in the XML text, leaving you to wonder what the reason for a failed import is. Replace at least the following characters: | |
+ | {| class="wikitable" | ||
+ | |- | ||
+ | ! normal text !! encoded text | ||
+ | |- | ||
+ | | > || &gt; | ||
+ | |- | ||
+ | | < || &lt; | ||
+ | |- | ||
+ | | & || &amp; | ||
+ | |} | ||
+ | For instance: <code>&nbsp;</code> must be encoded as <code>&amp;nbsp;</code>! | ||
+ | |||
+ | |||
+ | == Script Approach (Bash, {{abbr|Sed}}) == | ||
+ | |||
+ | <ol> | ||
+ | <li>export page(s) via Special:Export</li> | ||
+ | <li>get the XML header of the Import-Wiki and save it locally</li> | ||
+ | <li>edit settings section of bash script and run following bash script</li> | ||
+ | <li>check if it is valid XML</li> | ||
+ | <li>import XML either Special:Import or via command line (no blank after <code>\</code> or any other character!) | ||
+ | <syntaxhighlight lang="bash"> | ||
+ | cd /var/www/v-awikipath/here && \ | ||
+ | sudo -u www-data php ./maintenance/importDump.php --conf ./LocalSettings.php /path/to/Page-XML-Import.xml && \ | ||
+ | sudo -u www-data php ./extensions/TitleKey/rebuildTitleKeys.php --conf ./LocalSettings.php && \ | ||
+ | sudo -u www-data php ./maintenance/rebuildrecentchanges.php --conf ./LocalSettings.php | ||
+ | </syntaxhighlight> | ||
+ | </li> | ||
+ | </ol> | ||
+ | === Bash Script to Convert or Modify a Special:Export === | ||
+ | |||
+ | Bash script: <code>./convert_mediawiki-export4reimport.sh</code>. You can make the script executable by | ||
+ | <span style="color:gray"># '''u g o''' means: user who owns it ('''u'''), other users in the file's group ('''g'''), other users ('''o''')</span> | ||
+ | chmod ug+x convert_mediawiki-export4reimport.sh <span style="color:gray"># add executable mode for the owning user and the file's group</span> | ||
+ | |||
+ | Usage: | ||
+ | <syntaxhighlight lang="bash"> | ||
+ | ./convert_mediawiki-export4reimport.sh # just show what the output would be | ||
+ | ./convert_mediawiki-export4reimport.sh > Reimport_wikiname_what-kind_` date '+%Y-%m-%d_%H-%M'`.xml # export to file with a timestamp | ||
+ | </syntaxhighlight> | ||
+ | |||
+ | The script is considered to be a rather fragile script because it depends on space indentation and on the assumption that a comment (if present) follows almost right after <contributor></contributor>. But it can be used to do all necessary replacements in one single step: | ||
+ | <syntaxhighlight lang="bash"> | ||
+ | #!/bin/bash | ||
+ | # @description: Convert an XML Wiki export to a new XML re-import file, add comment, user name and user id as specified | ||
+ | # Usage | ||
+ | # ./convert_mediawiki-export4reimport.sh > Reimport_what_wikiname_` date '+%Y-%m-%d_%H-%M'`.xml | ||
+ | # @dependency binary sed | ||
+ | # @dependency saved file of the XML header until closing </siteinfo> of the Re-import-Wiki before all <page> start | ||
+ | |||
+ | ######################### | ||
+ | # Settings section | ||
+ | wiki_user_name="The Wiki User Name" | ||
+ | wiki_user_id="XXX" # the corresponding user id | ||
+ | reimport_comment="re import of ..." | ||
+ | |||
+ | # export_file_path: a Special:Export | ||
+ | export_file_path="/home/my-name/Documents/Wiki/Export/WikiName_Export_What_20160307092745.xml" | ||
+ | # reimport_header_file_path: can be any arbitrary Special:Export from the reimport-Wiki. The script only extracts the part it needs | ||
+ | reimport_header_file_path="/home/my-name/Documents/Wiki/Import/onwiki_header.xml" | ||
+ | ######################### | ||
+ | |||
+ | if [[ ! -e $reimport_header_file_path ]];then | ||
+ | echo -e "Error in $0" | ||
+ | echo -e "Header file $reimport_header_file_path \e[1mdoes not exist!!\e[0m (stop)" | ||
+ | exit 1; | ||
+ | fi | ||
+ | if [[ ! -e $export_file_path ]];then | ||
+ | echo -e "Error in $0" | ||
+ | echo -e "Wiki export file $export_file_path \e[1mdoes not exist!!\e[0m (stop)" | ||
+ | exit 1; | ||
+ | fi | ||
+ | |||
+ | |||
+ | sed --silent '/<mediawiki/,/<\/siteinfo>/{p;}' "${reimport_header_file_path}" | ||
+ | sed --silent '/<page>/,/<\/page>/{p;};/<minor\/>/{d}' "${export_file_path}" | sed " | ||
+ | # general replacements | ||
+ | /<contributor>/,/<\/contributor>/{ | ||
+ | s@<username>[^<]*</username>@<username>${wiki_user_name}</username>@g | ||
+ | s@<id>[^<]*</id>@<id>${wiki_user_id}</id>@g | ||
+ | /<\/contributor>/ { | ||
+ | N; | ||
+ | /<comment>/b comment_found | ||
+ | /<comment>/!b comment_not_found | ||
+ | } | ||
+ | }; | ||
+ | :comment_not_found | ||
+ | /<\/contributor>/ { # append comment | ||
+ | a\ | ||
+ | \ \ \ \ \ \ <comment>${reimport_comment}</comment> | ||
+ | } | ||
+ | :comment_found | ||
+ | /<comment>/,/<\/comment>/{ | ||
+ | :label_add_newlines | ||
+ | N; | ||
+ | # if line contains not (!) '</comment>' go (b)ack to label_add_newlines | ||
+ | /<\/comment>/!b label_add_newlines | ||
+ | s@\(<comment>\)\(.\+\)\(</comment>\)@\1${reimport_comment}\3@g; | ||
+ | } | ||
+ | |||
+ | # perhaps additional sed replacement commands here | ||
+ | " | sed " | ||
+ | # (d)elete lines with tags not wanted for re import | ||
+ | /^ <id>/{d}; | ||
+ | /^ <id>/{d}; | ||
+ | /^ <format>/{d}; | ||
+ | /^ <model>/{d}; | ||
+ | /^ <parentid>/{d}; | ||
+ | /^ <sha1>/{d}; | ||
+ | /^ <timestamp>/{d}; | ||
+ | s@<text xml:space=\"preserve\" bytes=\"[0-9]\+\"@<text xml:space=\"preserve\" @g; | ||
+ | " | ||
+ | echo "</mediawiki>" | ||
+ | </syntaxhighlight> | ||
---- | ---- | ||
Line 148: | Line 262: | ||
[[Category: Update templates]] | [[Category: Update templates]] | ||
[[Category: Import]] | [[Category: Import]] | ||
+ | [[Category:MediaWiki]] | ||
+ | [[Category:Export]] | ||
+ | [[Category:Bash script]] |
Latest revision as of 13:41, 8 March 2016
This is about importing the text of wiki pages using an xml format. For images or other files see: Batch importing files into MediaWiki.
Contents
Creating an export to be then reimported
Use Command Line (shell access to server required!):
cd /var/www/testwiki; php ./maintenance/dumpBackup.php --full --conf ./LocalSettings.php > ./testwikiexport.xml
(See also DumpBackup.php)
or the special page Special:Export. Highly interesting: Parameters_to_Special:Export
Importing Data from Command Line Interface
This is the preferred method, as it does not create additional versions (compare next section).
Import works directly with 7z (or zip, bzip) compressed xml files! HOWEVER, PRESENTLY .7z DOES NOT WORK. Transfer the xml file to the server, and execute (example):
cd /var/www/v-xxx/w; sudo php ./maintenance/importDump.php /var/www/v-xxx/w/import.xml --conf ./LocalSettings.php
cd /var/www/v-xxx/w; sudo php ./maintenance/rebuildall.php --conf ./LocalSettings.php
cd /var/www/v-xxx/w; sudo php ./maintenance/runJobs.php --conf ./LocalSettings.php --procs=3
# e.g. FOR OPENMEDIA:
cd /var/www/v-species/o; sudo php ./maintenance/importDump.php ./atmp/import.xml --conf ./LocalSettings.php
cd /var/www/v-species/o; sudo php ./maintenance/rebuildall.php --conf ./LocalSettings.php
cd /var/www/v-species/o; sudo php ./maintenance/runJobs.php --conf ./LocalSettings.php --procs=3
# e.g. FOR Naturführer:
cd /var/www/v-on/w; sudo php ./maintenance/importDump.php ./atmp/import.xml --conf ./LocalSettings.php
cd /var/www/v-on/w; sudo php ./maintenance/rebuildall.php --conf ./LocalSettings.php
cd /var/www/v-on/w; sudo php ./maintenance/runJobs.php --conf ./LocalSettings.php --procs=3
(Rebuilding internal indices is necessary after import; rebuildall may be slow and can be replaced with
cd /var/www/v-on/w; sudo php ./maintenance/rebuildrecentchanges.php --conf ./LocalSettings.php
if necessary. RunJobs: if import contains complex template relations or when updating template relations, data entries in templates, manually emptying the job queue may be necessary, check Special:Statistics in the wiki. Note: "--procs=3" will run three jobs in parallel, if the server has the necessary number of processor cores.)
Important: for all batchimporting, revisiondate must be set to something newer than all old revisions; else mediawiki will sort the imported revision behind existing revisions. The id in the imported xml is not necessary, however.
Batch deleting pages
Example:
sudo php ./maintenance/deleteBatch.php \
--conf ./LocalSettings.php \
-r "remove wrong resolution" ./maintenance/deleteBatch.txt
deleteBatch.txt contains only the filenames.
Note that for File:x.jpg pages, this will delete the file itself AND the page itself, but the file will still seem to exist when called. Only manually clicking "delete all" in file history will finish this. This is probably a bug, the php code attempts to handle file deletions.
Importing Data through Special Pages Web Interface
The Web interface under Special:Import will create extra revisions (in addition to those imported) designating the importing user. If you don't want to document who did a transfer, it may therefore be desirable to use the command-line version (see above). For the web import it may be desirable to create a special "Import-User" so that the name better documents authorship than using a normal username during upload of the xml file. Import creates two revisions for each page: Revision 1 is the imported revision, Revision 2 is the revision documenting the import process. If the imported data alone document this (e.g. when they already are using Import-User and an appropriate comment), it is possible to delete the second revisions in the database (assuming Import-User has ID=4):
Delete FROM PREFIX_revision WHERE PREFIX_revision.rev_user=4 AND PREFIX_revision.rev_minor_edit=1; -- Then need to fix the latest revision stored in page: UPDATE PREFIX_revision AS R2 INNER JOIN (PREFIX_page LEFT JOIN PREFIX_revision AS R1 ON PREFIX_page.page_latest=R1.rev_id) ON R2.rev_page=PREFIX_page.page_id SET page_latest=R2.rev_id WHERE R1.rev_id Is Null
Create a page (first time)
To do this, export a single page to get the XML-header and footer you need as a template later on. For creating a page, the important things are:
- <contributor>your user name</contributor> and your ID <id>123</id> (Preferences-tab User profile see Username: + <uid>)
- <title>new title</title> and the
- <text xml:space="preserve">Wiki text</text>
Then put it together with the exported XML document header:
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.4/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.4/ http://www.mediawiki.org/xml/export-0.4.xsd" version="0.4" xml:lang="en"> <siteinfo> <!-- … the XML header from an arbitrary wiki page export --> </siteinfo> <page> <title>GBIF:cultivar</title> <revision> <contributor><username>User name</username><id>123</id></contributor> <text xml:space="preserve">{{Term | collection = GBIF | short URI = cultivar | full URI = http://vocabularies.gbif.org/services/gbif/taxon_rank/cultivar | label = cultivar | code = cultivar | see also = http://rs.gbif.org/vocabulary/gbif/rank.xml }}</text> </revision> </page> </mediawiki>
This will create the page “GBIF:cultivar”.
Note it is:
<text xml:space="preserve">{{Term
...
}}</text>
| … and not with line break after <text xml:space="preserve"> … |
<text xml:space="preserve">
{{Term
...
}}
</text>
|
normal text | encoded text |
---|---|
> | &gt; |
< | &lt; |
& | &amp; |
For instance: &nbsp; must be encoded as &amp;nbsp;!
Script Approach (Bash, Sed)
- export page(s) via Special:Export
- get the XML header of the Import-Wiki and save it locally
- edit settings section of bash script and run following bash script
- check if it is valid XML
- import XML either via Special:Import or via command line (no blank or any other character after the trailing \ of each line!)
cd /var/www/v-awikipath/here && \
sudo -u www-data php ./maintenance/importDump.php --conf ./LocalSettings.php /path/to/Page-XML-Import.xml && \
sudo -u www-data php ./extensions/TitleKey/rebuildTitleKeys.php --conf ./LocalSettings.php && \
sudo -u www-data php ./maintenance/rebuildrecentchanges.php --conf ./LocalSettings.php
Bash Script to Convert or Modify a Special:Export
Bash script: ./convert_mediawiki-export4reimport.sh
. You can make the script executable by
# u g o means: user who owns it (u), other users in the file's group (g), other users (o)
chmod ug+x convert_mediawiki-export4reimport.sh # add executable mode for the owning user and the file's group
Usage:
./convert_mediawiki-export4reimport.sh # just show what the output would be
./convert_mediawiki-export4reimport.sh > Reimport_wikiname_what-kind_` date '+%Y-%m-%d_%H-%M'`.xml # export to file with a timestamp
The script is considered to be a rather fragile script because it depends on space indentation and on the assumption that a comment (if present) follows almost right after <contributor></contributor>. But it can be used to do all necessary replacements in one single step:
#!/bin/bash
# @description: Convert an XML wiki export (Special:Export) into a new XML
#   re-import file: set the comment, user name and user id as specified below.
# Usage (the generated XML is written to stdout, so redirect it into a file):
# ./convert_mediawiki-export4reimport.sh > Reimport_what_wikiname_` date '+%Y-%m-%d_%H-%M'`.xml
# @dependency binary sed
# @dependency saved file of the XML header until closing </siteinfo> of the Re-import-Wiki before all <page> start
#########################
# Settings section
# NOTE: the three values below are pasted verbatim into sed commands that use
# '@' as delimiter, so characters special to sed (e.g. '@', '&', '\') in them
# are not taken literally.
wiki_user_name="The Wiki User Name" # written into <username> inside <contributor>
wiki_user_id="XXX" # the corresponding user id, written into <id> inside <contributor>
reimport_comment="re import of ..." # text placed into <comment>
# export_file_path: a Special:Export file; its <page>…</page> sections are converted
export_file_path="/home/my-name/Documents/Wiki/Export/WikiName_Export_What_20160307092745.xml"
# reimport_header_file_path: can be any arbitrary Special:Export from the reimport-Wiki.
# The script only extracts the part it needs (from <mediawiki up to </siteinfo>).
reimport_header_file_path="/home/my-name/Documents/Wiki/Import/onwiki_header.xml"
#########################
# Abort early when one of the two input files is missing.
# The diagnostics are sent to stderr: stdout carries the generated XML and is
# normally redirected into the re-import file, so a message printed there
# would end up inside that file instead of being shown to the user.
if [[ ! -e "${reimport_header_file_path}" ]]; then
  echo -e "Error in $0" >&2
  echo -e "Header file $reimport_header_file_path \e[1mdoes not exist!!\e[0m (stop)" >&2
  exit 1
fi
if [[ ! -e "${export_file_path}" ]]; then
  echo -e "Error in $0" >&2
  echo -e "Wiki export file $export_file_path \e[1mdoes not exist!!\e[0m (stop)" >&2
  exit 1
fi
# Step 1: print the header of the re-import wiki, i.e. every line from the one
# containing "<mediawiki" up to and including the one containing "</siteinfo>".
sed --silent '/<mediawiki/,/<\/siteinfo>/{p;}' "${reimport_header_file_path}"
# Step 2: print every <page>…</page> section of the export and pipe it through
# two further sed programs (contributor/comment rewrite, then tag removal).
# Lines containing <minor/> are deleted BEFORE the print command. Previously the
# delete ran after `p` (under --silent), so the line had already been printed
# and <minor/> was never removed. The next stage only looks at the single line
# following </contributor> to decide whether a <comment> exists, so a surviving
# <minor/> line made it append a second comment.
sed --silent '/<page>/,/<\/page>/{/<minor\/>/d;p;}' "${export_file_path}" | sed "
# general replacements
/<contributor>/,/<\/contributor>/{
s@<username>[^<]*</username>@<username>${wiki_user_name}</username>@g
s@<id>[^<]*</id>@<id>${wiki_user_id}</id>@g
/<\/contributor>/ {
N;
/<comment>/b comment_found
/<comment>/!b comment_not_found
}
};
:comment_not_found
/<\/contributor>/ { # append comment
a\
\ \ \ \ \ \ <comment>${reimport_comment}</comment>
}
:comment_found
/<comment>/,/<\/comment>/{
:label_add_newlines
N;
# if line contains not (!) '</comment>' go (b)ack to label_add_newlines
/<\/comment>/!b label_add_newlines
s@\(<comment>\)\(.\+\)\(</comment>\)@\1${reimport_comment}\3@g;
}
# perhaps additional sed replacement commands here
" | sed "
# (d)elete lines with tags not wanted for re import
/^ <id>/{d};
/^ <id>/{d};
/^ <format>/{d};
/^ <model>/{d};
/^ <parentid>/{d};
/^ <sha1>/{d};
/^ <timestamp>/{d};
s@<text xml:space=\"preserve\" bytes=\"[0-9]\+\"@<text xml:space=\"preserve\" @g;
"
# Step 3: close the root element opened by the header printed in step 1.
echo "</mediawiki>"
See also: Batch importing files into MediaWiki (that is: images, etc.)